def test_stft_istft(self):
    """Check that ``signal.stft`` / ``signal.istft`` agree with librosa.

    The forward transform is compared against the transpose of librosa's
    output, and the two inverse transforms are compared directly.  When
    librosa is not installed a notice is printed and nothing is checked.
    """
    # Only the import is guarded: an ImportError raised anywhere else in the
    # test is a genuine failure and must not be reported as "librosa missing".
    try:
        import librosa
    except ImportError:
        print("test_stft_istft require librosa.")
        return

    ds = F.load_digit_wav()
    # ``keys()`` may return a non-indexable view (Python 3), so take the
    # first key by iteration instead of subscripting.
    name = next(iter(ds.keys()))
    path = ds[name]
    y, _ = speech.read(path, pcm=True)
    hop_length = int(0.01 * 8000)  # presumably a 10 ms hop at 8 kHz

    stft = signal.stft(y, n_fft=256, hop_length=hop_length, window='hann')
    stft_ = librosa.stft(y, n_fft=256, hop_length=hop_length, window='hann')
    self.assertTrue(np.allclose(stft, stft_.T))

    y1 = signal.istft(stft, hop_length=hop_length, window='hann')
    y2 = librosa.istft(stft_, hop_length=hop_length, window='hann')
    self.assertTrue(np.allclose(y1, y2))
def map(self, job):
    ''' Extract speech features for every segment of one audio file.

    Parameters
    ----------
    job : sequence
        Either the pair ``(audio_path, segments)`` or a one-element
        sequence wrapping that pair.  Each segment is
        ``(name, start, end, channel)``; ``start`` and ``end`` are
        multiplied by the sample rate to obtain sample indices, and a
        non-positive ``end`` selects everything up to the end of the signal.

    Return
    ------
    A generator of ``(name, [feature, ...])`` tuples, one per segment for
    which ``speech.speech_features`` returned a result; the feature list
    follows the order of ``self.__features_properties``.

    Raises
    ------
    Exception
        If both the file and ``self.sr`` specify a sample rate and they
        differ.  Any exception is re-raised after its traceback is printed.
    '''
    audio_path, segments = job[0] if len(job) == 1 else job
    try:
        # load audio data
        s, sr_orig = speech.read(audio_path)
        if sr_orig is not None and self.sr is not None and \
                sr_orig != self.sr:
            raise Exception('Given sample rate (%d Hz) is different from '
                            'audio file sample rate (%d Hz).' %
                            (self.sr, sr_orig))
        if sr_orig is None:
            sr_orig = self.sr
        N = len(s)
        # processing all segments
        ret = []
        for name, start, end, channel in segments:
            start = int(float(start) * sr_orig)
            # Convert before comparing: segment bounds may arrive as
            # strings, and comparing a str with 0 fails on Python 3.
            end_sec = float(end)
            end = N if end_sec <= 0 else int(end_sec * sr_orig)
            data = s[start:end, channel] if s.ndim > 1 else s[start:end]
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning)
                features = speech.speech_features(
                    data.ravel(), sr=sr_orig,
                    win=self.win, shift=self.shift,
                    nb_melfilters=self.nb_melfilters, nb_ceps=self.nb_ceps,
                    get_spec=self.get_spec, get_mspec=self.get_mspec,
                    get_mfcc=self.get_mfcc, get_qspec=self.get_qspec,
                    get_phase=self.get_phase, get_pitch=self.get_pitch,
                    get_vad=self.get_vad, get_energy=self.get_energy,
                    get_delta=self.get_delta,
                    pitch_threshold=self.pitch_threshold,
                    pitch_fmax=self.pitch_fmax,
                    vad_smooth=self.vad_smooth, vad_minlen=self.vad_minlen,
                    cqt_bins=self.cqt_bins, fmin=self.fmin, fmax=self.fmax,
                    sr_new=self.sr_new, preemphasis=self.preemphasis,
                    center=self.center)
            # Reported outside the ``catch_warnings`` block so that the
            # UserWarning filter above does not swallow this message.
            if features is not None:
                ret.append(
                    (name,
                     [features[i[0]] for i in self.__features_properties]))
            else:
                msg = 'Ignore segments: %s, error: NaN values' % name
                warnings.warn(msg)
        # return the results as a generator
        return (i for i in ret)
    except Exception:
        # Show where the failure happened, then let the caller decide;
        # a bare ``raise`` keeps the original traceback intact.
        import traceback
        traceback.print_exc()
        raise
def transform(self, X, start=0., end=-1, channel=0):
    """ Extract the features listed in ``self.feature_type`` from one signal.

    Parameters
    ----------
    X : str, numpy.ndarray or Data
        Path of an existing audio file (read with ``speech.read``), or the
        raw signal itself.  For raw signals the original sample frequency
        is taken from ``SpeechTransform.DEFAULT_FS`` and, if that is None,
        from ``self.fs``.
    start, end : number
        Bounds of the part to process; both are multiplied by the sample
        frequency to obtain sample indices.  A negative ``end`` selects
        everything up to the end of the signal.
    channel : int
        Column taken from ``X`` when it has more than one dimension.

    Returns
    -------
    None if the feature extractor returned None.  Otherwise the features in
    the order of ``self.feature_type`` (a single feature is returned bare
    rather than in a list); when ``self.vad`` is true the last element of
    the extractor output is appended and a list is always returned.

    Raises
    ------
    ValueError
        If ``X`` has an unsupported type, if no sample frequency can be
        determined, or if ``self.fs`` is higher than the original sample
        frequency (upsampling is not supported).
    """
    wanted = self.feature_type
    get_spec = 'spec' in wanted
    get_mspec = 'mspec' in wanted
    get_mfcc = 'mfcc' in wanted
    get_pitch = 'pitch' in wanted
    # ====== Read audio ====== #
    if isinstance(X, str) and os.path.exists(X):
        X, orig_fs = speech.read(X)
    elif isinstance(X, np.ndarray):
        orig_fs = None
    elif isinstance(X, Data):
        X = X[:]
        orig_fs = None
    else:
        raise ValueError('Cannot process data type %s' % type(X))
    # if specifed DEFAULT_FS
    if orig_fs is None:
        orig_fs = (SpeechTransform.DEFAULT_FS
                   if SpeechTransform.DEFAULT_FS is not None
                   else self.fs)
    if orig_fs is None:
        # Fail with a clear message instead of a TypeError further down.
        raise ValueError('Unknown sample frequency: the input carries none '
                         'and neither DEFAULT_FS nor fs is set.')
    # ====== check if downsample necessary ====== #
    # ``self.fs`` may be None (meaning: keep the original frequency), in
    # which case it must not be compared against ``orig_fs``.
    if self.fs is not None:
        if self.fs < orig_fs:  # downsample
            from scikits.samplerate import resample
            # float() keeps the ratio fractional under integer division
            X = resample(X, float(self.fs) / orig_fs, 'sinc_best')
        elif self.fs > orig_fs:
            raise ValueError('Cannot perform upsample from frequency: '
                             '{}Hz to {}Hz'.format(orig_fs, self.fs))
    fs = orig_fs if self.fs is None else self.fs
    # ====== preprocessing ====== #
    N = len(X)
    start = int(float(start) * fs)
    end = int(N if end < 0 else end * fs)
    X = X[start:end, channel] if X.ndim > 1 else X[start:end]
    data = speech_features_extraction(
        X.ravel(), fs=fs,
        n_filters=self.n_filters, n_ceps=self.n_ceps,
        win=self.win, shift=self.shift, delta_order=self.delta_order,
        energy=self.energy, vad=self.vad, dtype='float32',
        pitch_threshold=self.pitch_threshold, get_pitch=get_pitch,
        get_spec=get_spec, get_mspec=get_mspec, get_mfcc=get_mfcc)
    # ====== return results ====== #
    if data is None:
        return None
    # Each requested entry of ``data`` is a 3-tuple whose first item is the
    # feature itself; the other two items are not used here.
    layout = (('spec', get_spec, 0), ('mspec', get_mspec, 1),
              ('mfcc', get_mfcc, 2), ('pitch', get_pitch, 3))
    extracted = {}
    for key, requested, idx in layout:
        if requested:
            feat, _, _ = data[idx]
            extracted[key] = feat
    results = [extracted[i] for i in self.feature_type]
    if len(results) == 1:
        results = results[0]
    if self.vad:
        return (results + [data[-1]]
                if isinstance(results, (tuple, list))
                else [results, data[-1]])
    return results
# NOTE(review): the original line structure and indentation of this snippet
# were lost, so the code below is left untouched.
# It opens with the final `return` of a function whose header is not visible
# here, followed by a top-level demo script: for every path in `files` the
# script prints the path, reads the audio with `speech.read`, calls
# `test_func` to obtain `segs, vad, voices, cut`, prints the shape and
# `len(s) / sr` of each segment, and draws a 4-row figure (resampled signal,
# `vad`, `voices`, `cut`).
# NOTE(review): the inner loop `for s in segs` reuses the name `s` of the
# signal returned by `speech.read`; without the indentation it cannot be
# determined whether the plotted `s` is the whole signal or a segment, nor
# whether the plotting runs per file or per segment - TODO confirm.
return results[0] if len(results) == 1 else results files = [ '/Users/trungnt13/tmp/20051026_180611_340_fsp-b.sph', '/Users/trungnt13/tmp/20051118_212058_553_fsp-b.sph', '/Users/trungnt13/tmp/20110213_140144_892-b.sph', '/Users/trungnt13/tmp/en_4077-a.sph', '/Users/trungnt13/tmp/en_4660-a.sph', '/Users/trungnt13/tmp/en_6402-b.sph', '/Users/trungnt13/tmp/fla_0811-b.sph', '/Users/trungnt13/tmp/fla_0946-b.sph', '/Users/trungnt13/tmp/lre17_vlujcseb.sph' ] for f in files: print(f) s, sr = speech.read(f, remove_dc_offset=True, remove_zeros=True) segs, vad, voices, cut = test_func(s, sr, frame_length=0.02 * sr, maximum_duration=10, minimum_duration=None, return_vad=True, return_cut=True, return_voices=True) for s in segs: print(s.shape, len(s) / sr) plt.figure(figsize=(10, 6)) plt.subplot(4, 1, 1) plt.plot(speech.resample(s, sr, 2000, best_algorithm=False)) plt.subplot(4, 1, 2) plt.plot(vad) plt.subplot(4, 1, 3) plt.plot(voices) plt.subplot(4, 1, 4) plt.plot(cut) plt.show()
# NOTE(review): the original line structure of this snippet was lost, so the
# code below is left untouched.
# Demo script: selects the 'TkAgg' matplotlib backend, loads the digit-wav
# data path via `F.load_digit_wav()`, collects the files whose name contains
# '.wav', reads the first one with `speech.read`, prints its shape and sample
# rate, and starts a `speech.speech_features` call.
# NOTE(review): that final call is cut off after `get_vad=True,` in this
# view; its remaining arguments and anything following it are not visible.
import matplotlib matplotlib.use('TkAgg') from matplotlib import pyplot as plt import seaborn import numpy as np import shutil import os from odin import fuel as F, utils from odin.preprocessing import speech from odin import visual datapath = F.load_digit_wav() print(datapath) files = utils.get_all_files(datapath, lambda x: '.wav' in x) y, sr = speech.read(files[0]) print('Raw signal:', y.shape, sr) feat = speech.speech_features(y, sr, win=0.02, shift=0.01, nb_melfilters=40, nb_ceps=13, get_spec=True, get_mspec=True, get_mfcc=True, get_qspec=True, get_phase=True, get_pitch=True, get_vad=True,
# NOTE(review): the original line structure and indentation of this snippet
# were lost, so the code below is left untouched.
# It opens with the final `return` of a function whose header is not visible
# here, followed by a top-level demo script: for every path in `files` the
# script prints the path, reads the audio with `speech.read`, calls
# `test_func` to obtain `segs, vad, voices, cut`, prints the shape and
# `len(s) / sr` of each segment, and starts a 4-row figure (resampled signal,
# then `vad`).
# NOTE(review): the snippet ends after the second subplot; the rest of the
# figure and any `plt.show()` call are not visible here.
# NOTE(review): the inner loop `for s in segs` reuses the name `s` of the
# signal returned by `speech.read`; without the indentation it cannot be
# determined whether the plotted `s` is the whole signal or a segment -
# TODO confirm.
return results[0] if len(results) == 1 else results files = [ '/Users/trungnt13/tmp/20051026_180611_340_fsp-b.sph', '/Users/trungnt13/tmp/20051118_212058_553_fsp-b.sph', '/Users/trungnt13/tmp/20110213_140144_892-b.sph', '/Users/trungnt13/tmp/en_4077-a.sph', '/Users/trungnt13/tmp/en_4660-a.sph', '/Users/trungnt13/tmp/en_6402-b.sph', '/Users/trungnt13/tmp/fla_0811-b.sph', '/Users/trungnt13/tmp/fla_0946-b.sph', '/Users/trungnt13/tmp/lre17_vlujcseb.sph' ] for f in files: print(f) s, sr = speech.read(f, remove_dc_offset=True, remove_zeros=True) segs, vad, voices, cut = test_func(s, sr, frame_length=0.02 * sr, maximum_duration=10, minimum_duration=None, return_vad=True, return_cut=True, return_voices=True) for s in segs: print(s.shape, len(s) / sr) plt.figure(figsize=(10, 6)) plt.subplot(4, 1, 1) plt.plot(speech.resample(s, sr, 2000, best_algorithm=False)) plt.subplot(4, 1, 2) plt.plot(vad)
def plot_audio(s, sr=None, win=0.02, shift=0.01, nb_melfilters=40, nb_ceps=12,
               get_qspec=False, get_vad=True, fmin=64, fmax=None,
               sr_new=None, preemphasis=0.97,
               pitch_threshold=0.8, pitch_fmax=1200,
               vad_smooth=3, vad_minlen=0.1,
               cqt_bins=96, center=True, title=""):
    """Plot the waveform, energy and spectrogram(s) of an audio signal.

    ``s`` is either a path (read with ``speech.read``; the file's sample
    rate is used when ``sr`` is None) or an already-loaded signal.  The
    features are computed with ``speech.speech_features`` and returned
    after three kinds of figures have been drawn: raw signal plus energy,
    the STFT power spectrum, and - when ``get_qspec`` is set - the CQT
    power spectrum.  A falsy ``get_vad`` is replaced by True because the
    spectrogram figures always read the ``'vad'`` feature.

    Returns
    -------
    The object returned by ``speech.speech_features``.
    """
    from matplotlib import pyplot as plt
    from odin.preprocessing import speech

    # ====== helper ====== #
    def spectrogram(spec, vad, title):
        # One figure per feature.  When the first dimension is at least
        # 8 times the second, the full view is followed by four panels,
        # one per quarter of the first axis.
        plt.figure()
        if spec.shape[0] / spec.shape[1] >= 8.:
            total = len(spec)
            q1, q2, q3 = total // 4, total // 2, 3 * total // 4
            plt.subplot2grid((3, 4), (0, 0), rowspan=1, colspan=4)
            plot_spectrogram(spec.T, vad=vad)
            # (grid position, slice start, slice stop) of each quarter
            panels = (((1, 0), None, q1),
                      ((1, 2), q1, q2),
                      ((2, 0), q2, q3),
                      ((2, 2), q3, None))
            for loc, lo, hi in panels:
                plt.subplot2grid((3, 4), loc, rowspan=1, colspan=2)
                plot_spectrogram(spec[lo:hi].T, vad=vad[lo:hi])
        else:
            plot_spectrogram(spec.T, vad=vad)
        plt.suptitle(str(title))
        plt.tight_layout()

    # ====== load signal ====== #
    if isinstance(s, string_types):
        name = os.path.basename(s)
        s, file_sr = speech.read(s)
        if sr is None:
            sr = file_sr
    else:
        name = str(s.shape) + "-" + str(sr)
    title = str(title) + ":" + name

    # ====== processing ====== #
    if not get_vad:
        get_vad = True
    feature_options = dict(
        win=win, shift=shift,
        nb_melfilters=nb_melfilters, nb_ceps=nb_ceps,
        get_spec=True, get_mspec=True, get_mfcc=True,
        get_qspec=get_qspec, get_phase=False, get_pitch=False,
        get_vad=get_vad, get_energy=True, get_delta=False,
        fmin=fmin, fmax=fmax, sr_new=sr_new, preemphasis=preemphasis,
        pitch_threshold=pitch_threshold, pitch_fmax=pitch_fmax,
        vad_smooth=vad_smooth, vad_minlen=vad_minlen,
        cqt_bins=cqt_bins, center=center)
    y = speech.speech_features(s, sr, **feature_options)

    # ====== plot raw signals ====== #
    # Signals above 16 kHz are resampled to 16 kHz before plotting.
    if sr > 16000:
        s = speech.resample(s, sr, 16000)
        sr = 16000
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(s)
    plt.subplot(2, 1, 2)
    plt.plot(y['energy'].ravel())
    plt.tight_layout()
    plt.suptitle(title)

    # ====== plot spectrogram ====== #
    voice_activity = y['vad']
    spectrogram(y['spec'], voice_activity, title='STFT power spectrum')
    if get_qspec:
        spectrogram(y['qspec'], voice_activity, title='CQT power spectrum')
    return y