def mashability(song1, song2):
    """
    Returns how well song1 transitions into song2, scored as the mean
    frame-wise cosine distance between the chromagrams (FFT energy folded
    into semitone bins) of the two mix regions. Lower is better.
    """
    # If the tempos differ by more than 30 BPM we should never make that
    # transition, so return the worst possible cosine distance.
    if abs(song1.bpm - song2.bpm) > 30:
        return 1
    sample_length = MIX_LENGTH  # beats per sample
    beats1 = song1.AudioFile.analysis.beats[song1.mix_out : song1.mix_out + sample_length]
    beats2 = song2.AudioFile.analysis.beats[song2.mix_in : song2.mix_in + sample_length]
    data1 = audio.getpieces(song1.AudioFile, beats1)
    data2 = audio.getpieces(song2.AudioFile, beats2)
    data1.encode("temp1.mp3")
    data2.encode("temp2.mp3")
    y1, sr1 = librosa.load("temp1.mp3")
    y2, sr2 = librosa.load("temp2.mp3")
    S1 = np.abs(librosa.stft(y1, n_fft=4096))
    chroma1 = librosa.feature.chroma_stft(S=S1, sr=sr1)
    S2 = np.abs(librosa.stft(y2, n_fft=4096))
    chroma2 = librosa.feature.chroma_stft(S=S2, sr=sr2)
    # im = librosa.display.specshow(chroma1, x_axis="time", y_axis="chroma")
    # im2 = librosa.display.specshow(chroma2, x_axis="time", y_axis="chroma")
    # plt.show()
    orthogonal_arr = []
    for i in range(min(chroma1.shape[1], chroma2.shape[1])):
        orthogonal_arr.append(dst.cosine(chroma1[:, i], chroma2[:, i]))
    return sum(orthogonal_arr) / len(orthogonal_arr)
def reverse_channel(a, b, n_fft=2**13, win_length=2**12, hop_length=2**10): ''' Estimates the channel distortion in b relative to a and reverses it :parameters: - a : np.ndarray Some signal - b : np.ndarray Some other signal with channel distortion relative to a - n_fft : int Number of samples in each FFT computation, default 2**13 - win_length : int Number of samples in each window, default 2**12 - hop_length : int Number of samples between successive FFT computations, default 2**10 :returns: - b_filtered : np.ndarray The signal b, filtered to reduce channel distortion ''' # Compute spectrograms a_spec = librosa.stft(a, n_fft=n_fft, win_length=win_length, hop_length=hop_length) b_spec = librosa.stft(b, n_fft=n_fft, win_length=win_length, hop_length=hop_length) # Compute the best filter H = best_filter_coefficients(a_spec, b_spec) # Apply it in the frequency domain (ignoring aliasing! Yikes) b_spec_filtered = H*b_spec # Get back to time domain b_filtered = librosa.istft(b_spec_filtered, win_length=win_length, hop_length=hop_length) return b_filtered
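reverse_channel() above calls best_filter_coefficients(), which is not included in this section. A minimal sketch of what such a helper could look like, assuming it returns one complex gain per frequency bin fitted by least squares; the name and signature follow the call above, but the body is an assumption, not the original implementation:

def best_filter_coefficients(a_spec, b_spec, eps=1e-8):
    '''Per-bin complex gain H[k] minimizing ||a_spec[k, :] - H[k] * b_spec[k, :]||^2.'''
    # Closed-form least-squares solution for each frequency bin:
    #   H[k] = sum_t a[k, t] * conj(b[k, t]) / sum_t |b[k, t]|^2
    num = np.sum(a_spec * np.conj(b_spec), axis=1)
    den = np.sum(np.abs(b_spec) ** 2, axis=1) + eps
    return (num / den)[:, np.newaxis]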
def test_stft_bad_window():
    # A window shorter than win_length (which defaults to n_fft) is invalid,
    # so stft should raise a ParameterError.
    y = np.zeros(22050 * 5)
    n_fft = 2048
    window = np.ones(n_fft // 2)
    with pytest.raises(librosa.ParameterError):
        librosa.stft(y, n_fft=n_fft, window=window)
def getSpectra(self, n_fft=4096, hop_length=1024):
    if self.spectra is None:
        stft = librosa.stft(self.signal, n_fft=n_fft, hop_length=hop_length)
        self.spectra = Spectra(stft, self.sr, n_fft, hop_length)
    elif self.spectra.n_fft != n_fft or self.spectra.hop_length != hop_length:
        # Parameters differ from the cached analysis: return a fresh Spectra
        # without overwriting the cache.
        stft = librosa.stft(self.signal, n_fft=n_fft, hop_length=hop_length)
        return Spectra(stft, self.sr, n_fft, hop_length)
    return self.spectra
def hpss(y): D = librosa.stft(y) H, P = librosa.decompose.hpss(D, kernel_size=KERNEL_SIZE, power=HPSS_P) D_harm = np.abs(librosa.stft(librosa.istft(H), n_fft=N_FFT, hop_length=HOP)) D_perc = np.abs(librosa.stft(librosa.istft(P), n_fft=N_FFT, hop_length=HOP)) return D_harm, D_perc
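hpss() above depends on module-level constants that the snippet does not define. A minimal sketch with plausible values (assumed, not taken from the original source), followed by a usage line:

KERNEL_SIZE = 31   # median-filter kernel size for librosa.decompose.hpss
HPSS_P = 2.0       # soft-mask power for librosa.decompose.hpss
N_FFT = 2048       # FFT size for the re-analysis STFTs
HOP = 512          # hop length for the re-analysis STFTs

# y: mono audio time series, e.g. y, sr = librosa.load('example.wav')
D_harm, D_perc = hpss(y)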
def __test_stft(center, pad_mode): D1 = librosa.stft(y, center=center, pad_mode='reflect') D2 = librosa.stft(y, center=center, pad_mode=pad_mode) assert D1.shape == D2.shape if center and pad_mode != 'reflect': assert not np.allclose(D1, D2) else: assert np.allclose(D1, D2)
def SaveSpectrogram(y_mix, y_inst,y_vocal, filename, orig_sr=44100) : y_mix = librosa.core.resample(y_mix,orig_sr,SR) y_vocal = librosa.core.resample(y_vocal,orig_sr,SR) y_inst = librosa.core.resample(y_inst,orig_sr,SR) S_mix = np.abs(librosa.stft(y_mix,n_fft=window_size,hop_length=hop_length)).astype(np.float32) S_inst = np.abs(librosa.stft(y_inst,n_fft=window_size,hop_length=hop_length)).astype(np.float32) S_vocal = np.abs(librosa.stft(y_vocal,n_fft=window_size,hop_length=hop_length)).astype(np.float32) norm = S_mix.max() S_mix /= norm S_inst /= norm S_vocal /= norm np.savez(os.path.join('./Spectrogram',filename+'.npz'),mix=S_mix,inst=S_inst ,vocal=S_vocal)
def wavs_to_specs(wavs_mono, wavs_src1, wavs_src2, n_fft = 1024, hop_length = None): stfts_mono = list() stfts_src1 = list() stfts_src2 = list() for wav_mono, wav_src1, wav_src2 in zip(wavs_mono, wavs_src1, wavs_src2): stft_mono = librosa.stft(wav_mono, n_fft = n_fft, hop_length = hop_length) stft_src1 = librosa.stft(wav_src1, n_fft = n_fft, hop_length = hop_length) stft_src2 = librosa.stft(wav_src2, n_fft = n_fft, hop_length = hop_length) stfts_mono.append(stft_mono) stfts_src1.append(stft_src1) stfts_src2.append(stft_src2) return stfts_mono, stfts_src1, stfts_src2
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512): ''' Feature extraction routine for midi data, converts to a drum-free, percussion-suppressed CQT. Input: midi - pretty_midi.PrettyMIDI object sf2_path - path to .sf2 file to pass to pretty_midi.fluidsynth fs - sampling rate to synthesize audio at, default 22050 hop - hop length for cqt, default 512 Output: midi_gram - Simulated CQT of the midi data ''' # Synthesize the MIDI using the supplied sf2 path midi_audio = midi.fluidsynth(fs=fs, sf2_path=sf2_path) # Use the harmonic part of the signal H, P = librosa.decompose.hpss(librosa.stft(midi_audio)) midi_audio_harmonic = librosa.istft(H) # Compute log frequency spectrogram of audio synthesized from MIDI midi_gram = np.abs(librosa.cqt(y=midi_audio_harmonic, sr=fs, hop_length=hop, fmin=librosa.midi_to_hz(36), n_bins=60, tuning=0.0))**2 return midi_gram
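A hedged usage sketch for midi_to_cqt(): load a MIDI file with pretty_midi and compute the simulated CQT. The file and SoundFont paths are placeholders.

import pretty_midi

midi = pretty_midi.PrettyMIDI('example.mid')          # placeholder path
midi_gram = midi_to_cqt(midi, sf2_path='soundfont.sf2', fs=22050, hop=512)
print(midi_gram.shape)  # (60 CQT bins, n_frames)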
def amplitude_for_file(audio_path): y, sr = librosa.load(audio_path) # from http://bmcfee.github.io/librosa/librosa.html#librosa.core.logamplitude # Get a power spectrogram from a waveform y S = np.abs(librosa.stft(y)) ** 2 log_S = librosa.logamplitude(S) return log_S
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512): ''' Feature extraction for audio data. Gets a power CQT of harmonic component and onset strength signal of percussive. Input: midi - pretty_midi.PrettyMIDI object fs - sampling rate to synthesize audio at, default 22050 hop - hop length for cqt, default 512, onset strength hop will be 1/4 of this Output: audio_gram - CQT of audio data audio_onset_strength - onset strength signal ''' # Use harmonic part for gram, percussive part for onsets H, P = librosa.decompose.hpss(librosa.stft(audio)) audio_harmonic = librosa.istft(H) audio_percussive = librosa.istft(P) # Compute log-frequency spectrogram of original audio audio_gram = np.abs(librosa.cqt(y=audio_harmonic, sr=fs, hop_length=hop, fmin=librosa.midi_to_hz(36), n_bins=60))**2 # Beat track the audio file at 4x the hop rate audio_onset_strength = librosa.onset.onset_strength(audio_percussive, hop_length=hop/4, sr=fs) return audio_gram, audio_onset_strength
def test_piptrack():

    def __test(S, freq):
        pitches, mags = librosa.piptrack(S=S, fmin=100)
        idx = (mags > 0)
        assert len(idx) > 0
        recovered_pitches = pitches[idx]
        # We should be within 1e-2 octaves (about 12 cents) of the target
        assert np.all(np.abs(np.log2(recovered_pitches) - np.log2(freq)) <= 1e-2)

    sr = 22050
    duration = 3.0
    for freq in [110, 220, 440, 880]:
        # Generate a sine tone
        y = np.sin(2 * np.pi * freq * np.linspace(0, duration, num=int(duration * sr)))
        for n_fft in [1024, 2048, 4096]:
            # Using left-aligned frames eliminates reflection artifacts at the boundaries
            S = np.abs(librosa.stft(y, n_fft=n_fft, center=False))
            yield __test, S, freq
def parse_audio(path, audio_conf, windows, normalize=False):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters used to compute the spectrogram
        windows    : dict, mapping of window names to window functions
    Output:
        spect      : FloatTensor, per-frame spectrogram
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate'] * audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate'] * audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = torch.FloatTensor(spect)
    spect = spect.log1p()
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect.transpose(0, 1)
def parse_audio(self, audio_path): if self.augment: y = load_randomly_augmented_audio(audio_path, self.sample_rate) else: y = load_audio(audio_path) if self.noiseInjector: add_noise = np.random.binomial(1, self.noise_prob) if add_noise: y = self.noiseInjector.inject_noise(y) n_fft = int(self.sample_rate * self.window_size) win_length = n_fft hop_length = int(self.sample_rate * self.window_stride) # STFT D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=self.window) spect, phase = librosa.magphase(D) # S = log(S+1) spect = np.log1p(spect) spect = torch.FloatTensor(spect) if self.normalize: mean = spect.mean() std = spect.std() spect.add_(-mean) spect.div_(std) return spect
def test_piptrack_properties(): def __test(S, n_fft, hop_length, fmin, fmax, threshold): pitches, mags = librosa.core.piptrack(S=S, n_fft=n_fft, hop_length=hop_length, fmin=fmin, fmax=fmax, threshold=threshold) # Shape tests eq_(S.shape, pitches.shape) eq_(S.shape, mags.shape) # Make sure all magnitudes are positive assert np.all(mags >= 0) # Check the frequency estimates for bins with non-zero magnitude idx = (mags > 0) assert np.all(pitches[idx] >= fmin) assert np.all(pitches[idx] <= fmax) # And everywhere else, pitch should be 0 assert np.all(pitches[~idx] == 0) y, sr = librosa.load('data/test1_22050.wav') for n_fft in [2048, 4096]: for hop_length in [None, n_fft // 4, n_fft // 2]: S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)) for fmin in [0, 100]: for fmax in [4000, 8000, sr // 2]: for threshold in [0.1, 0.2, 0.5]: yield __test, S, n_fft, hop_length, fmin, fmax, threshold
def stretch_demo(input_file, output_file, speed): '''Phase-vocoder time stretch demo function. :parameters: - input_file : str path to input audio - output_file : str path to save output (wav) - speed : float > 0 speed up by this factor ''' N_FFT = 2048 HOP_LENGTH = N_FFT /4 # 1. Load the wav file, resample print 'Loading ', input_file y, sr = librosa.load(input_file) # 2. generate STFT @ 2048 samples print 'Computing short-time fourier transform... ' D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH) print 'Playing back at %3.f%% speed' % (speed * 100) D_stretch = librosa.phase_vocoder(D, speed, hop_length=HOP_LENGTH) y_stretch = librosa.istft(D_stretch, hop_length=HOP_LENGTH) print 'Saving stretched audio to: ', output_file librosa.output.write_wav(output_file, y_stretch, sr)
def compute_beat_mls(filename, beat_times, mel_bands=num_mel_bands, fft_size=1024, hop_size=512):
    """
    Compute a per-beat Mel log spectrogram (frame-wise maximum within each beat)
    given previously extracted beat times.
    :param filename: path to audio file
    :param beat_times: list of beat times in seconds
    :param mel_bands: number of Mel bands
    :param fft_size: FFT size
    :param hop_size: hop size for FFT processing
    :return: beat Mel spectrogram (mel_bands x beats)
    """
    y, sr = librosa.load(filename, sr=22050, mono=True)
    spec = np.abs(librosa.stft(y=y, n_fft=fft_size, hop_length=hop_size, win_length=fft_size,
                               window=scipy.signal.hamming))
    mel_fb = librosa.filters.mel(sr=22050, n_fft=fft_size, n_mels=mel_bands, fmin=50, fmax=10000, htk=True)
    s = np.sum(mel_fb, axis=1)
    mel_fb = np.divide(mel_fb, s[:, np.newaxis])
    mel_spec = np.dot(mel_fb, spec)
    mel_spec = np.log10(1. + 1000. * mel_spec)
    # Convert beat times to integer frame indices so they can be used as slice bounds.
    beat_frames = np.round(np.asarray(beat_times) * (22050. / hop_size)).astype(int)
    beat_melspec = np.max(mel_spec[:, beat_frames[0]:beat_frames[1]], axis=1)
    for k in xrange(1, beat_frames.shape[0] - 1):
        beat_melspec = np.column_stack((beat_melspec,
                                        np.max(mel_spec[:, beat_frames[k]:beat_frames[k + 1]], axis=1)))
    return beat_melspec
def __test_consistency(frame_length, hop_length, center): y, sr = librosa.load(__EXAMPLE_FILE, sr=None) # Ensure audio is divisible into frame size. y = librosa.util.fix_length(y, y.size - y.size % frame_length) assert y.size % frame_length == 0 # STFT magnitudes with a constant windowing function and no centering. S = librosa.magphase(librosa.stft(y, n_fft=frame_length, hop_length=hop_length, window=np.ones, center=center))[0] # Try both RMS methods. rms1 = librosa.feature.rms(S=S, frame_length=frame_length, hop_length=hop_length) rms2 = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length, center=center) assert rms1.shape == rms2.shape # Normalize envelopes. rms1 /= rms1.max() rms2 /= rms2.max() # Ensure results are similar. np.testing.assert_allclose(rms1, rms2, rtol=5e-2)
def get_spectrograms(sound_file): '''Extracts melspectrogram and log magnitude from given `sound_file`. Args: sound_file: A string. Full path of a sound file. Returns: Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels) Transposed magnitude: A 2d array.Has shape of (T, 1+hp.n_fft//2) ''' # Loading sound file y, sr = librosa.load(sound_file, sr=hp.sr) # or set sr to hp.sr. # stft. D: (1+n_fft//2, T) D = librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length) # magnitude spectrogram magnitude = np.abs(D) #(1+n_fft/2, T) # power spectrogram power = magnitude**2 #(1+n_fft/2, T) # mel spectrogram S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels) #(n_mels, T) return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32)) # (T, n_mels), (T, 1+n_fft/2)
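get_spectrograms() reads its settings from a module-level hp object that is not shown here. A minimal sketch of the fields it needs (the values are assumptions, not the original configuration):

class hp:
    sr = 22050          # sampling rate passed to librosa.load
    n_fft = 2048        # FFT size
    hop_length = 512    # hop between STFT frames
    win_length = 2048   # analysis window length
    n_mels = 80         # number of Mel bands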
def getChromagram(self, n_fft=4096, hop_length=1024):
    if self.chromagram is None:
        if self.spectra is None:
            stft = librosa.stft(self.signal, n_fft=n_fft, hop_length=hop_length)
            self.spectra = Spectra(stft, self.sr, n_fft, hop_length)
        self.chromagram = librosa.feature.chromagram(S=self.spectra.getMagnitude())
    return self.chromagram
def mfcc_clustering(file_name, n_clusters): """ From Prem :return: """ clusterer = KMeans(n_clusters=n_clusters) print(file_name) mix, sr = librosa.load(file_name) mix_stft = librosa.stft(mix) comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1]) cluster_comps = librosa.feature.mfcc(S=comps)[1:14] save_mfcc_img(file_name[:-4] + "_mfcc.png", np.flipud(cluster_comps)) clusterer.fit_transform(cluster_comps.T) labels = clusterer.labels_ # print(labels) sources = [] for cluster_index in range(n_clusters): indices = np.where(labels == cluster_index)[0] template, residual = extract_template(comps[:, indices], mix_stft) t = librosa.istft(template) sources.append(t) return np.array(sources)
def find_peaks(self, n_fft=2048, tau=10, kappa=10):
    '''
    Extracts spectral peaks from the loaded audio data, returned as
    (frame, bin) index pairs that can later be hashed into an integer fingerprint.
    '''
    # compute stft
    if self._audio_data is None:
        raise ValueError("No audio data loaded.")
    S = librosa.stft(self._audio_data, n_fft)

    # librosa.stft returns an array of shape (n_bins, n_frames)
    n_bins = S.shape[0]
    n_frames = S.shape[1]

    # find peaks: a point is a peak if no neighbour in a tau x kappa
    # neighbourhood has a larger magnitude
    peaks = []
    for n in range(n_frames):
        for k in range(n_bins):
            p = S[k][n]
            is_peak = True
            # search neighborhood
            for i in range(tau):
                for j in range(kappa):
                    n_ = n + i - tau // 2
                    k_ = k + j - kappa // 2
                    if 0 <= n_ < n_frames and 0 <= k_ < n_bins and not (n_ == n and k_ == k):
                        p_ = S[k_][n_]
                        if abs(p_) > abs(p):
                            is_peak = False
            if is_peak:
                peaks.append((n, k))
    return peaks
def percussive(y): '''Extract the percussive component of an audio time series''' D = librosa.stft(y) P = librosa.decompose.hpss(D)[1] return librosa.istft(P)
def extract_features(audio_file: Path, seconds: int = params.nsynth_max_seconds, window_size: int = params.librosa_spec_windows, hop_length: int = params.librosa_hop_length, calc_chroma_stft=True, calc_mfcc_stft=True, calc_mfcc=True): audio_data, sr = librosa.load(audio_file, sr=None) if seconds: audio_data = audio_data[:seconds * sr] stft = np.abs(librosa.stft( audio_data, hop_length=hop_length)) if calc_chroma_stft or calc_mfcc_stft else None chroma_stft = librosa.feature.chroma_stft( S=stft, sr=sr, n_chroma=window_size, hop_length=hop_length) if calc_chroma_stft else None mfcc_stft = librosa.feature.mfcc( audio_data, S=stft, sr=sr, n_mfcc=window_size, hop_length=hop_length) if calc_mfcc_stft else None mfcc = librosa.feature.mfcc( audio_data, sr=sr, n_mfcc=window_size, hop_length=hop_length) if calc_mfcc else None return (chroma_stft, mfcc_stft, mfcc)
def makeSpectragrams(filename):
    f, sr = librosa.load(filename)
    print "first"
    melSpectra = librosa.feature.melspectrogram(f)
    cqtSpectra = librosa.cqt(f)
    stftSpectra = librosa.stft(f)
    print "stuff"

    # specshow expects real-valued data, so display dB-scaled magnitudes
    librosa.display.specshow(librosa.power_to_db(melSpectra))
    # plt.specgram(melSpectra)
    imageName = filename + "MelSpectrogram.png"
    title = "Mel Spectrogram \nof " + filename[26:]
    plt.title(title)
    plt.ion()
    # plt.savefig(imageName)
    plt.show()

    librosa.display.specshow(librosa.amplitude_to_db(np.abs(cqtSpectra)))
    title = "Constant Q Spectrogram \nof " + filename[26:]
    plt.title(title)
    plt.show()

    librosa.display.specshow(librosa.amplitude_to_db(np.abs(stftSpectra)))
    title = "STFT Spectrogram \nof " + filename[26:]
    plt.title(title)
    plt.show()
    return True
def get_feature(fname): #b,_ = librosa.load(fname, res_type = 'kaiser_fast') b,_ = librosa.load(fname, res_type = 'kaiser_fast') try: mfcc = np.mean(librosa.feature.mfcc(y = b,n_mfcc=60).T,axis=0) mels = np.mean(librosa.feature.melspectrogram(b, sr = SAMPLE_RATE).T,axis = 0) stft = np.abs(librosa.stft(b)) chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr = SAMPLE_RATE).T,axis = 0) contrast=np.mean(librosa.feature.spectral_contrast(S=stft, sr=SAMPLE_RATE).T,axis=0) tonnetz=np.mean(librosa.feature.tonnetz(librosa.effects.harmonic(b), sr = SAMPLE_RATE).T,axis = 0) ft2 = librosa.feature.zero_crossing_rate(b)[0] ft3 = librosa.feature.spectral_rolloff(b)[0] ft4 = librosa.feature.spectral_centroid(b)[0] ft5 = librosa.feature.spectral_contrast(b)[0] ft6 = librosa.feature.spectral_bandwidth(b)[0] ft2_trunc = np.hstack([np.mean(ft2),np.std(ft2), skew(ft2), np.max(ft2), np.min(ft2)]) ft3_trunc = np.hstack([np.mean(ft3),np.std(ft3), skew(ft3), np.max(ft3), np.min(ft3)]) ft4_trunc = np.hstack([np.mean(ft4),np.std(ft4), skew(ft4), np.max(ft4), np.min(ft4)]) ft5_trunc = np.hstack([np.mean(ft5),np.std(ft5), skew(ft5), np.max(ft5), np.min(ft5)]) ft6_trunc = np.hstack([np.mean(ft6),np.std(ft6), skew(ft6), np.max(ft6), np.min(ft6)]) return pd.Series(np.hstack((mfcc,mels,chroma,contrast,tonnetz,ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc))) #d = np.hstack([mfcc,mels,chroma,contrast,tonnetz,ft2_trunc,ft3_trunc,ft4_trunc,ft5_trunc,ft6_trunc]) #features = np.empty((0,238)) #d = np.vstack([features,d]) except: print('bad file') return pd.Series([0]*238)
def parse_audio(path, audio_conf, windows, normalize=True):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters used to compute the spectrogram
        windows    : dict, mapping of window names to window functions
    Output:
        spect      : ndarray, per-frame spectrogram
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate'] * audio_conf["window_size"])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate'] * audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = np.log1p(spect)
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect = np.add(spect, -mean)
        spect = np.divide(spect, std)
    return spect.transpose()
def melspectrogram(y, sr=22050, n_fft=256, hop_length=128, **kwargs): """Compute a mel spectrogram from a time series Arguments: y -- (ndarray) audio time-series sr -- (int) sampling rate of y | default: 22050 n_fft -- (int) number of FFT components | default: 256 hop_length -- (int) frames to hop | default: 128 **kwargs -- Mel filterbank parameters See melfb() documentation for details. Returns S: S -- (ndarray) Mel spectrogram """ # Compute the STFT powspec = np.abs(librosa.stft(y, n_fft = n_fft, hann_w = n_fft, hop_length = hop_length))**2 # Build a Mel filter mel_basis = melfb(sr, n_fft, **kwargs) # Remove everything past the nyquist frequency mel_basis = mel_basis[:, :(n_fft/ 2 + 1)] return np.dot(mel_basis, powspec)
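melspectrogram() above relies on a melfb() helper that is not shown. A minimal sketch, assuming it simply wraps librosa's Mel filterbank constructor and forwards any extra keyword arguments (n_mels, fmin, fmax, ...); the original melfb may have returned a wider matrix, which is why the caller trims past the Nyquist bin:

def melfb(sr, n_fft, **kwargs):
    # Mel filter matrix of shape (n_mels, 1 + n_fft // 2); with this form the
    # Nyquist-trimming slice in melspectrogram() is a no-op.
    return librosa.filters.mel(sr=sr, n_fft=n_fft, **kwargs)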
def compute_cqt(filename): a, sr = librosa.load(filename, sr=SR) spectrum = librosa.stft(a) harm_spec, _ = librosa.decompose.hpss(spectrum) harm = librosa.istft(harm_spec) cqt = np.abs(librosa.cqt(harm, sr=sr, hop_length=HOP, real=False)) return cqt
def test_real_hpss(): # Load an audio signal y, sr = librosa.load('data/test1_22050.wav') D = np.abs(librosa.stft(y)) def __hpss_test(window, power, mask, margin): H, P = librosa.decompose.hpss(D, kernel_size=window, power=power, mask=mask, margin=margin) if margin == 1.0 or margin == (1.0, 1.0): if mask: assert np.allclose(H + P, np.ones_like(D)) else: assert np.allclose(H + P, D) else: if mask: assert not np.any(H.astype(bool) & P.astype(bool)) else: assert np.all(H + P <= D) for window in [31, (5, 5)]: for power in [1, 2, 10]: for mask in [False, True]: for margin in [1.0, 3.0, (1.0, 1.0), (9.0, 10.0)]: yield __hpss_test, window, power, mask, margin
def local_audio_3d(self, filepath, mode="waveform"): """ Converts a local audio file into a 3D model. Args: filepath: string containing the path to the audio file mode: musical parameter to be used to create the 3D model options: waveform, stft """ print "Loading audio file " + filepath waveform, sr = librosa.load(filepath) if mode == "waveform": # time domain analysis # Downsample waveform and store positive values only if len(waveform) > 1000: m = len(str(len(waveform))) downsample_factor = (10 ** (m-1-3) * int(str(len(waveform))[0])) # 1k (i.e. 10^3) magnitude else: downsample_factor = 1 half_waveform = [waveform[i] for i in xrange(len(waveform)) if waveform[i]>0 and i%downsample_factor==0] # Reshape and rescale waveform processed_waveform = self._movingaverage(half_waveform, self.ma_window_size) # processed_waveform = self._limit_spikes(half_waveform, np.mean(half_waveform), 5) processed_waveform = self._rescale_list(processed_waveform, 0, self.height_Y) processed_waveform = self.make_waveform_square(processed_waveform, self.n_waveform_bars) # make waveform "square" # Convert 2D waveform into 3D print "Creating 3D model" model_3d = self.make_waveform_3d(processed_waveform, self.height_Z) else: # frequency domain analysis self.mask_val /= 100 # Get STFT magnitude print "Analyzing frequency components" stft = librosa.stft(waveform, n_fft=256) stft, phase = librosa.magphase(stft) # Downsample and rescale STFT if len(stft[0]) > 1000: m = len(str(len(stft[0]))) downsample_factor = (10 ** (m-1-3)) * int(str(len(stft[0]))[0]) # 1k (i.e. 10^3) magnitude else: downsample_factor = 1 new_stft = [] for curr_fft in stft: min_loudness_value = max(curr_fft) ds_fft = [curr_fft[j] + min_loudness_value for j in xrange(len(curr_fft)) if j%downsample_factor==0] ds_fft = self._rescale_list(ds_fft, self.min_absolute_value, self.height_Z) new_stft.append(ds_fft) print "Creating 3D model" model_3d = np.array(new_stft) print "Exporting the 3D file" if self.OUTPUT_FOLDER[-1] != '/': self.OUTPUT_FOLDER.append('/') output_filename = self.OUTPUT_FOLDER + filepath.split('/')[-1][:-4] + "_" + mode + ".stl" numpy2stl(model_3d, output_filename, scale=self.scale, mask_val=self.mask_val, solid=True)
def plot_signal(idx, data): if len(idx) == 0: raise PreventUpdate figs = make_subplots(rows=2, cols=1, subplot_titles=('Waveform', 'Spectrogram')) try: filename = data[idx[0]]['audio_filepath'] audio, fs = librosa.load(filename, sr=None) time_stride = 0.01 hop_length = int(fs * time_stride) n_fft = 512 # linear scale spectrogram s = librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length) s_db = librosa.power_to_db(np.abs(s)**2, ref=np.max, top_db=100) figs.add_trace( go.Scatter( x=np.arange(audio.shape[0]) / fs, y=audio, line={'color': 'green'}, name='Waveform', hovertemplate= 'Time: %{x:.2f} s<br>Amplitude: %{y:.2f}<br><extra></extra>', ), row=1, col=1, ) figs.add_trace( go.Heatmap( z=s_db, colorscale=[ [0, 'rgb(30,62,62)'], [0.5, 'rgb(30,128,128)'], [1, 'rgb(30,255,30)'], ], colorbar=dict(yanchor='middle', lenmode='fraction', y=0.2, len=0.5, ticksuffix=' dB'), dx=time_stride, dy=fs / n_fft / 1000, name='Spectrogram', hovertemplate= 'Time: %{x:.2f} s<br>Frequency: %{y:.2f} kHz<br>Magnitude: %{z:.2f} dB<extra></extra>', ), row=2, col=1, ) figs.update_layout({ 'margin': dict(l=0, r=0, t=20, b=0, pad=0), 'height': 500 }) figs.update_xaxes(title_text='Time, s', row=1, col=1) figs.update_yaxes(title_text='Amplitude', row=1, col=1) figs.update_xaxes(title_text='Time, s', row=2, col=1) figs.update_yaxes(title_text='Frequency, kHz', row=2, col=1) except Exception: pass return figs
""" Created on Fri Mar 8 08:41:08 2019 @author: MR toad """ import librosa import os import numpy as np import librosa.display import matplotlib.pyplot as plt #from PIL import Image file_path = 'E:/speech/' mfcc_path = 'E:/mfcc/' pic_path = 'E:/pic' file_name_list = os.listdir(file_path) for file_name in file_name_list: y, sr = librosa.load(file_path + file_name) mfcc_feature = librosa.feature.mfcc(y=y, sr=sr) np.save(mfcc_path + file_name.split('.')[0] + ".npy", mfcc_feature) plt.figure(figsize=(12, 8)) D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max) plt.subplot(4, 2, 1) librosa.display.specshow(D, y_axis='linear') plt.colorbar(format='%+2.0f dB') plt.title('Linear-frequency power spectrogram') plt.savefig(file_name.split('.')[0] + ".png", dpi=300)
def _stft(self, x): return librosa.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length)
def GenerateLibrosaFeatures(wav_path): y, sr = librosa.load(wav_path) stft = librosa.stft(y, n_fft=BUFFER_LENGTH, hop_length=HOP_LENGTH) D = np.abs(stft)**2 S = np.log(librosa.feature.melspectrogram(S=D, n_mels=MEL_COUNT)) return S
for chromagram_i in range(12):
    data[i, 50 + chromagram_i] = chromagram_mean[chromagram_i]
    data[i, 62 + chromagram_i] = chromagram_var[chromagram_i]
#plt.figure(figsize=(16, 6))
#librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')

#----------------------
# Fourier Transform

# Default FFT window size
n_fft = 2048  # FFT window size
hop_length = 512  # number of audio frames between STFT columns (looks like a good default)

# Short-time Fourier transform (STFT)
D = np.abs(librosa.stft(audio_file, n_fft=n_fft, hop_length=hop_length))  # magnitude D = (frequency, time)

frequent_weights = D.sum(axis=1)  # per-frequency weight summed over time (rows = frequency, columns = time)
frequent_list = librosa.fft_frequencies(sr=sr, n_fft=n_fft) / sr
#print(np.shape(frequent_list))
#print(np.shape(frequent_weights))
#print(frequent_list)
#print(frequent_weights)

chroma_stft_mean = np.average(frequent_list, weights=frequent_weights)
chroma_stft_var = sum(((frequent_list - chroma_stft_mean) * frequent_weights)**2) / sum(frequent_weights)
#print(chroma_stft_mean)
#print(chroma_stft_var)
data[i, 74] = chroma_stft_mean
data[i, 75] = chroma_stft_var

print(data)
def get_spec(wav, n_fft=1024, window="hamming", hop_length=256): return librosa.stft(wav, window=window, n_fft=n_fft, hop_length=hop_length)
def stft_fn(y): return librosa.stft(y=y, n_fft=int(frame_size), hop_length=hop_size, center=False).T
def specgram_lbrs(audiopath: str, plotpath: str=None, name: str=None, cmap: str='gray_r', algorithm='default', y_axis=None, **kwargs): """ Generates a spectrogram of an audio file using librosa.display.specshow function. The output will be in png format. :param audiopath: string Path of the audio file. :param plotpath: string Path to plot the spectrogram. Default to the current working directory. :param name: string Name of the output image. Default to audio name. :param cmap: string Automatic colormap detection See matplotlib.pyplot.pcolormesh. :type algorithm: str or callable :param algorithm: Algorithm to use to compute the spectrogram. Available algorithms: 'default', 'mel', 'log'. The 'default' mode uses the librosa.stft function to compute the spectrogram data, the 'mel' uses the librosa.feature.melspectrogram function and 'log' uses librosa.cqt function. Expected return type of the algorithm: np.ndarray [shape=(Any, t)] :param y_axis: None or str. Range for the y-axes. This parameter is passed to the librosa.display.specshow function. :param kwargs: Additional kwargs are passed on to the defined algorithm function. """ if plotpath is not None and not os.path.isdir(plotpath): os.makedirs(plotpath) if algorithm not in ['mel', 'log', 'default'] or not callable: raise ValueError('Unrecognized scale or not a callable object.') # Load audio and convert it to mono y, sr = librosa.load(audiopath) y = librosa.core.to_mono(y) # Apply algorithm to obtain an array of spectrogram data if algorithm == 'default': spec_data = librosa.stft(y, **kwargs) # Convert the data spectrogram to decibel units spec_data = librosa.power_to_db(librosa.magphase(spec_data, power=2)[0], ref=np.max) elif algorithm == 'mel': kwargs.setdefault('n_mels', 128) kwargs.setdefault('fmax', 8000) spec_data = librosa.feature.melspectrogram(y=y, sr=sr, **kwargs) # Convert the data spectrogram to decibel units spec_data = librosa.power_to_db(spec_data, ref=np.max) elif algorithm == 'log': spec_data = librosa.cqt(y, sr, **kwargs) # Convert the data spectrogram to decibel units spec_data = librosa.power_to_db(librosa.magphase(spec_data, power=2)[0], ref=np.max) else: spec_data = algorithm(y=y, sr=sr, **kwargs) # Plot spectrogram fig = plt.figure(figsize=FIG_SIZE) librosa.display.specshow(spec_data, sr=sr, cmap=cmap, y_axis=y_axis) plt.axis('off') fig.subplots_adjust(left=0, right=1, bottom=0, top=1) if name is None: name = audiopath.split('/')[-1] if plotpath is not None: plt.savefig(plotpath + '/' + name + '.png') else: plt.savefig(name + '.png') plt.close()
def to_spec(wav, len_frame=ModelConfig.L_FRAME, len_hop=ModelConfig.L_HOP): return librosa.stft(wav, n_fft=len_frame, hop_length=len_hop)
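to_spec() takes its defaults from a ModelConfig class that is not part of this snippet. A minimal sketch with assumed values:

class ModelConfig:
    L_FRAME = 1024   # STFT frame length (n_fft)
    L_HOP = 256      # hop length between frames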
def features(X, sample_rate):
    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the minimum and maximum fundamental frequencies of human speech
    pitches, magnitudes = librosa.piptrack(X, sr=sample_rate, S=stft, fmin=70, fmax=400)
    pitch = []
    for i in range(magnitudes.shape[1]):
        index = magnitudes[:, i].argmax()
        pitch.append(pitches[index, i])

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)
    pitchmin = np.min(pitch)

    # Spectral centroid
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC features with 50 coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)
    mfccsstd = np.std(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)
    mfccmax = np.max(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)

    # Chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)

    # Octave-based spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, phase = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # Root-mean-square energy
    rmse = librosa.feature.rmse(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent, maxcent,
        stdMagnitude, pitchmean, pitchmax, pitchstd, pitch_tuning_offset, meanrms,
        maxrms, stdrms
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features
plt.xlabel("Frequency") plt.ylabel("Magnitude") plt.title("Power spectrum") # STFT -> spectrogram hop_length = 512 # in num. of samples n_fft = 2048 # window in num. of samples # calculate duration hop length and window in seconds hop_length_duration = float(hop_length) / sample_rate n_fft_duration = float(n_fft) / sample_rate print("STFT hop length duration is: {}s".format(hop_length_duration)) print("STFT window duration is: {}s".format(n_fft_duration)) # perform stft stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length) # calculate abs values on complex numbers to get magnitude spectrogram = np.abs(stft) # display spectrogram plt.figure(figsize=FIG_SIZE) librosa.display.specshow(spectrogram, sr=sample_rate, hop_length=hop_length) plt.xlabel("Time") plt.ylabel("Frequency") plt.colorbar() plt.title("Spectrogram") # apply logarithm to cast amplitude to Decibels log_spectrogram = librosa.amplitude_to_db(spectrogram) plt.figure(figsize=FIG_SIZE)
for j in range( len(info1)): #Get the files in the folder inside each id #print(j) path1 = path + '/' + info1[j] # Path of the info1 folder info2 = os.listdir(path1) # Get the audio files for k in range(len(info2)): # For each audio files path2 = path1 + '/' + info2[k] sig, rate = librosa.load(path2, duration=3.0) #rate, sig = wav.read(path2) # Read .wav signal sig = scipy.signal.medfilt(sig, kernel_size=None) # Filtering mfcc1 = librosa.feature.mfcc( sig, rate, n_mfcc=13) #, hop_length=800, n_fft=1600) #print(mfcc1.shape) librosa_stft = np.abs(librosa.stft(sig)) cent = librosa.feature.spectral_centroid(y=sig, sr=rate) #print(cent.shape) flat = librosa.feature.spectral_flatness(y=sig) #print(flat.shape) rolloff = librosa.feature.spectral_rolloff(y=sig, sr=rate, roll_percent=0.99) #print(rolloff.shape) zcr = librosa.feature.zero_crossing_rate(sig) #print(zcr.shape) Tot_feat = np.vstack( (mfcc1, librosa_stft, cent, flat, rolloff, zcr)) #Tot_feat = speechpy.processing.cmvn(Tot_feat,variance_normalization=True)
def separate_melody_accompaniment(x, Fs, N, H, traj, n_harmonics=10, tol_cent=50): """F0-based melody-accompaniement separation Notebook: C8/C8S2_MelodyExtractSep.ipynb Args: x: Audio signal Fs: Sampling frequency N: Window size in samples H: Hopsize in samples traj: F0 traj (time in seconds in 1st column, frequency in Hz in 2nd column) n_harmonics: Number of harmonics tol_cent: Tolerance in cents Returns: x_mel: Reconstructed audio signal for melody x_acc: Reconstructed audio signal for accompaniement """ # Compute STFT X = librosa.stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant') Fs_feature = Fs / H T_coef = np.arange(X.shape[1]) / Fs_feature freq_res = Fs / N F_coef = np.arange(X.shape[0]) * freq_res # Adjust trajectory traj_X_values = interp1d(traj[:, 0], traj[:, 1], kind='nearest', fill_value='extrapolate')(T_coef) traj_X = np.hstack((T_coef[:, None], traj_X_values[:, None, ])) # Compute binary masks mask_mel = convert_trajectory_to_mask_cent(traj_X, F_coef, n_harmonics=n_harmonics, tol_cent=tol_cent) mask_acc = np.ones(mask_mel.shape) - mask_mel # Compute masked STFTs X_mel = X * mask_mel X_acc = X * mask_acc # Reconstruct signals x_mel = librosa.istft(X_mel, hop_length=H, win_length=N, window='hann', center=True, length=x.size) x_acc = librosa.istft(X_acc, hop_length=H, win_length=N, window='hann', center=True, length=x.size) return x_mel, x_acc
def extract_IMU_sound_video(video_IMU_data): features = np.empty((0, Data_window_watch, 12)) labels = [] features_sound = np.empty((0, 193)) features_video = np.empty((0, Data_window_video, 64, 64, 3)) for i in range(len(video_IMU_data)): print(i) #Saving features after every 100 samples if (i + 1) % 100 == 0: features_video = features_video.astype('uint8') data = [features, labels, features_sound, features_video] with open('Data_train_' + str(i) + 'ser.pkl', 'wb') as f: pickle.dump(data, f) features = np.empty((0, Data_window_watch, 12)) labels = [] features_sound = np.empty((0, 193)) features_video = np.empty((0, Data_window_video, 64, 64, 3)) #print(video_IMU_data[i][0]) V_name = video_IMU_data[i][0] V_type = video_IMU_data[i][1] V_stime = video_IMU_data[i][2] V_etime = video_IMU_data[i][3] #print(V_name,V_etime-V_stime) duration = V_etime - V_stime #Sound Features Sound_X = video_IMU_data[i][6] Sound_sample_rate_expected = video_IMU_data[i][5] #print('Leng of sound:',len(Sound_X)) sound_len = len(Sound_X) sound_sampling = (sound_len * 1000) / duration #Duration is in milli seconds #Sound windows below sound_win = [] start = 0 end = start + sound_sampling * TimeWindow end = int(end) while end <= sound_len: winx = Sound_X[start:end] stft = np.abs(librosa.stft(winx)) mfccs = np.mean(librosa.feature.mfcc(y=winx, sr=Sound_sample_rate_expected, n_mfcc=40).T, axis=0) chroma = np.mean(librosa.feature.chroma_stft( S=stft, sr=Sound_sample_rate_expected).T, axis=0) mel = np.mean(librosa.feature.melspectrogram( winx, sr=Sound_sample_rate_expected).T, axis=0) contrast = np.mean(librosa.feature.spectral_contrast( S=stft, sr=Sound_sample_rate_expected).T, axis=0) tonnetz = np.mean(librosa.feature.tonnetz( y=librosa.effects.harmonic(winx), sr=Sound_sample_rate_expected).T, axis=0) #print (start,end,len(winx)) ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz]) sound_win.append(ext_features) start = start + sound_sampling * TimeWindow_slide start = int(start) end = start + sound_sampling * TimeWindow end = int(end) #Video Features and Windows video_len = video_IMU_data[i][7] #print(video_len,duration) video_sampling = (video_len * 1000) / duration video_raw = [] start = 0 end = start + video_sampling * TimeWindow end = int(end) #print('Video Sampling Rate:',video_sampling) #print('Start is:',start,': End is:',end) # cap = cv2.VideoCapture(V_name) # video_continue=True # while video_continue: # video_continue, img = cap.read() # #video_raw.append(cv2.resize(img, (img_size, img_size))) # video_raw.append(cv2.resize(img, (img_size, img_size))) # # video_raw.append(img) cap = cv2.VideoCapture(V_name) while True: ret, img = cap.read() if not ret: break video_raw.append( cv2.resize(img, (img_size, img_size)).astype('uint8')) Windows_video = [] while end <= video_len: #print('Start is:',start,': End is:',end) win_video = video_raw[start:end] win_video = win_video[:Data_window_video] Windows_video.append(win_video) start = start + video_sampling * TimeWindow_slide start = int(start) end = start + video_sampling * TimeWindow end = int(end) #Dict of sensor data V_dict = dict() # Loop over the data items for j in range(len(video_IMU_data[i][4])): #print(video_IMU_data[i][4][j]) S_type = video_IMU_data[i][4][j][0] timestamp = video_IMU_data[i][4][j][2] x = video_IMU_data[i][4][j][3] y = video_IMU_data[i][4][j][4] z = video_IMU_data[i][4][j][5] if S_type in V_dict: V_dict[S_type].append([timestamp, x, y, z]) else: V_dict[S_type] = [] V_dict[S_type].append([timestamp, x, y, z]) 
#print(S_type,timestamp,x,y,z) # 3 = Acc Right Wrist # 4 = GYRO Right Wrist # 11 = Acc left Wrist # 12 = GYRO left Wrist raw3 = [] raw4 = [] raw11 = [] raw12 = [] for sid in V_dict: if sid == 'raw3': #print(sid,len(V_dict[sid])) # Timewindow 2 Sec, Sampling 25HZ. total_samples = len(V_dict[sid]) sampling = (total_samples * 1000) / (duration) #print('Sampling',sampling) start = 0 end = start + sampling * TimeWindow end = int(end) while end <= total_samples: datawind = V_dict[sid][start:end] #Fixing Datawind_leng Here: datawind = datawind[:Data_window_watch] raw3.append(datawind) #print('window Samples:',len(datawind)) #winlen1.append(len(datawind)) #print(start,end) start = start + (sampling) * (TimeWindow_slide) start = int(start) end = start + sampling * TimeWindow end = int(end) if sid == 'raw4': #print(sid,len(V_dict[sid])) # Timewindow 2 Sec, Sampling 25HZ. total_samples = len(V_dict[sid]) sampling = (total_samples * 1000) / (duration) #print('Sampling',sampling) start = 0 end = start + sampling * TimeWindow end = int(end) while end <= total_samples: datawind = V_dict[sid][start:end] #print('window Samples:',len(datawind)) #winlen2.append(len(datawind)) datawind = datawind[:Data_window_watch] raw4.append(datawind) #print(start,end) start = start + (sampling) * (TimeWindow_slide) start = int(start) end = start + sampling * TimeWindow end = int(end) if sid == 'raw11': #print(sid,len(V_dict[sid])) # Timewindow 2 Sec, Sampling 25HZ. total_samples = len(V_dict[sid]) sampling = (total_samples * 1000) / (duration) #print('Sampling',sampling) start = 0 end = start + sampling * TimeWindow end = int(end) while end <= total_samples: datawind = V_dict[sid][start:end] #print('window Samples:',len(datawind)) #winlen3.append(len(datawind)) datawind = datawind[:Data_window_watch] raw11.append(datawind) #print(start,end) start = start + (sampling) * (TimeWindow_slide) start = int(start) end = start + sampling * TimeWindow end = int(end) if sid == 'raw12': #print(sid,len(V_dict[sid])) # Timewindow 2 Sec, Sampling 25HZ. 
total_samples = len(V_dict[sid]) sampling = (total_samples * 1000) / (duration) #print('Sampling',sampling) start = 0 end = start + sampling * TimeWindow end = int(end) while end <= total_samples: datawind = V_dict[sid][start:end] datawind = datawind[:Data_window_watch] raw12.append(datawind) #print('window Samples:',len(datawind)) #winlen4.append(len(datawind)) #print(start,end) start = start + (sampling) * (TimeWindow_slide) start = int(start) end = start + sampling * TimeWindow end = int(end) #print(sid,len(V_dict[sid])) #print(len(raw3),len(raw4),len(raw11),len(raw12)) raw3 = np.array(raw3) raw4 = np.array(raw4) raw11 = np.array(raw11) raw12 = np.array(raw12) sound_win = np.array(sound_win) Windows_video = np.array(Windows_video) # We abandon the timestamp #print(raw3.shape,raw4.shape,raw11.shape,raw12.shape) raw3_windows = raw3.shape[0] raw4_windows = raw4.shape[0] raw11_windows = raw11.shape[0] raw12_windows = raw12.shape[0] #print('sensor wind:',raw12_windows) sound_win_windows = sound_win.shape[0] video_win_windows = Windows_video.shape[0] #print('Sound wind:',sound_win_windows) #print('Video wind:',video_win_windows) try: #Sometimes Watch have less features, and we don't know the Reason min_features_watch = np.array( [raw3.shape[1], raw4.shape[1], raw11.shape[1], raw12.shape[1]]).min() if min_features_watch < Data_window_watch: print('Skipping:', raw3.shape, raw4.shape, raw11.shape, raw12.shape) continue min_features_Video = Windows_video.shape[1] if min_features_Video < Data_window_video: print('Skipping:', Windows_video.shape) continue except: print('Error Skipping:', raw3.shape, raw4.shape, raw11.shape, raw12.shape) continue #print(sound_win_windows) min_windows = np.array([ raw3_windows, raw4_windows, raw11_windows, raw12_windows, sound_win_windows, video_win_windows ]).min() if min_windows > 0: output = np.concatenate( (raw3[:min_windows, :, 1:4], raw4[:min_windows, :, 1:4], raw11[:min_windows, :, 1:4], raw12[:min_windows, :, 1:4]), axis=2) #print(output.shape) features = np.vstack([features, output]) # print(sound_win[:min_windows].shape) features_sound = np.vstack( [features_sound, sound_win[:min_windows]]) #print(Windows_video[:min_windows].shape, features_video.shape) features_video = np.vstack( [features_video, Windows_video[:min_windows]]) #output.shape[0],V_type for k in range(output.shape[0]): labels.append(V_type) # print(raw4.shape) # print(output.shape) # print(features.shape, features_sound.shape, Windows_video[:min_windows].shape ) # break # 12-num of windows, 47-num of samples in window 12= 3*4 num of sensor reading types features_video = features_video.astype('uint8') data = [features, labels, features_sound, features_video] with open('Data_train_' + str(i) + '.pkl', 'wb') as f: pickle.dump(data, f) print('Saved: ' + 'Data_train_' + str(i) + '.pkl') print(features_video.shape) return features, labels, features_sound, features_video
def stft(y): return librosa.stft( y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)
energy.append(np.mean(e))

ent = 0.0
m = np.mean(e)
for j in range(0, len(e[0])):
    q = np.absolute(e[0][j] - m)
    ent = ent + (q * np.log10(q))
entropy_of_energy.append(ent)

f_list_1 = []
f_list_1.append(zero_crossings)
f_list_1.append(energy)
f_list_1.append(entropy_of_energy)
f_np_1 = np.array(f_list_1)
f_np_1 = np.transpose(f_np_1)[:-1]

kmeans = KMeans(n_clusters=2, random_state=0).fit(f_np_1)
result = kmeans.predict(f_np_1)

D = li.amplitude_to_db(np.abs(li.stft(y)), ref=np.max)

plt.subplot(3, 1, 1)
plt.title("Audio Analog Signal")
plt.plot(y[1950:2000])

plt.subplot(3, 1, 2)
plt.title("Spectrogram")
librosa.display.specshow(D, y_axis='linear')
plt.colorbar(format='%+2.0f dB')

plt.subplot(3, 1, 3)
plt.title("Audio Digital Signal")
plt.plot(result, marker='d', color='blue', drawstyle='steps')

plt.show()

stream.stop_stream()
stream.close()
audio.terminate()
def LoadAudio(file_path): y, sr = load(file_path, sr=SR) stft = librosa.stft(y, n_fft=window_size, hop_length=hop_length) mag, phase = librosa.magphase(stft) return mag.astype(np.float32), phase
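A hedged round-trip sketch for LoadAudio(): the magnitude/phase pair it returns can be recombined and inverted with the same hop length. SR, window_size and hop_length are module-level settings assumed by the snippet above, and the path is a placeholder:

mag, phase = LoadAudio('example.wav')
y_rec = librosa.istft(mag * phase, hop_length=hop_length)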
x = np.reshape(x, (-1, N_CHANNELS)) if np.array_equal(x.T, a_content): print("equal") else: print("not equal") print("content = ", a_content) print("x = ", x) diff = a_content - x.T print("diff = ", diff) a = np.zeros_like(a_content) a[:N_CHANNELS, :] = np.exp(x.T) - 1 p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi for i in range(500): S = a * np.exp(1j * p) x = librosa.istft(S) p = np.angle(librosa.stft(x, N_FFT)) OUTPUT_FILENAME = 'outputs/' + CONTENT_FILENAME[: -4] + '_' + STYLE_FILENAME[:-4] + '_ctw-' + str( CONTENT_WEIGHT ) + '_stw-' + str( STYLE_WEIGHT ) + '_iter-' + str( ITERATIONS) + '.wav' librosa.output.write_wav(OUTPUT_FILENAME, x, fs) print(OUTPUT_FILENAME) print("done")
my_dpi = 120 for index, row in speakers_filtered.iterrows(): dir_ = root + '/' + row['SUBSET'] + '/' + str(row['ID']) + '/' print('working on df row {}, spaker {}'.format(index, row['CODE'])) if not os.path.exists(dir_): print('dir {} not exists, skipping'.format(dir_)) continue files_iter = Path(dir_).glob('**/*.flac') files_ = [str(f) for f in files_iter] for f in files_: ay, sr = librosa.load(f) duration = ay.shape[0] / sr start = 0 while start + 5 < duration: slice_ = ay[start * sr:(start + 5) * sr] start = start + 5 - 1 x = librosa.stft(slice_) xdb = librosa.amplitude_to_db(abs(x)) plt.figure(figsize=(227 / my_dpi, 227 / my_dpi), dpi=my_dpi) plt.axis('off') librosa.display.specshow(xdb, sr=sr, x_axis='time', y_axis='log') plt.savefig(root + '/train-gram/' + str(row['CODE']) + '/' + uuid.uuid4().hex + '.png', dpi=my_dpi) plt.close() print('work done on index {}, speaker {}'.format(index, row['CODE']))
import numpy as np import librosa import librosa.display import matplotlib.pyplot as plt # Get the file path to the included audio example filename = 'Train/blues/blues.00000.au' # Load the example clip y, sr = librosa.load(filename) # Compute spectral centroid sc = librosa.feature.spectral_centroid(y=y, sr=sr) # Compute spectrogram S, phase = librosa.magphase(librosa.stft(y=y)) librosa.feature.spectral_centroid(S=S) # Plot the result plt.figure() plt.subplot(2, 1, 1) plt.semilogy(sc.T, label='Spectral centroid') plt.ylabel('Hz') plt.xticks([]) plt.xlim([0, sc.shape[-1]]) plt.legend() plt.subplot(2, 1, 2) librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), y_axis='log', x_axis='time') plt.title('log Power spectrogram')
def read_audio_spectum(filename): x, fs = librosa.load(filename) S = librosa.stft(x, N_FFT) p = np.angle(S) S = np.log1p(np.abs(S[:, :430])) return S, fs
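read_audio_spectum() references a module-level N_FFT and keeps at most 430 STFT frames. A minimal sketch of the missing constant and a usage line (the value and path are assumptions):

N_FFT = 2048
S, fs = read_audio_spectum('content.wav')   # S has shape (1 + N_FFT // 2, up to 430 frames)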
def getStft(y, n_fft=2048, hop_length=512): return librosa.feature.rmse(S=librosa.stft(y, n_fft=n_fft, hop_length=hop_length))[0]
def display_sample_info(file_path, label=''): """Generate various representations a given audio file. E.g. Mel, MFCC and power spectrogram's. Args: file_path (str): Path to the audio file. label (str): Optional label to display for the given audio file. Returns: Nothing. """ if not os.path.isfile(file_path): raise ValueError('{} does not exist.'.format(file_path)) # By default, all audio is mixed to mono and resampled to 22050 Hz at load time. y, sr = librosa.load(file_path, sr=None, mono=True) # At 16000 Hz, 512 samples ~= 32ms. At 16000 Hz, 200 samples = 12ms. 16 samples = 1ms @ 16kHz. hop_length = 200 # Number of samples between successive frames e.g. columns if a spectrogram. f_max = sr / 2. # Maximum frequency (Nyquist rate). f_min = 64. # Minimum frequency. n_fft = 1024 # Number of samples in a frame. n_mels = 80 # Number of Mel bins to generate. n_mfcc = 13 # Number of Mel cepstral coefficients to extract. win_length = 333 # Window length. # Create info string. num_samples = y.shape[0] duration = librosa.get_duration(y=y, sr=sr) info_str_format = 'Label: {}\nPath: {}\nDuration={:.3f}s with {:,d} Samples\n' \ 'Sampling Rate={:,d} Hz\nMin, Max=[{:.2f}, {:.2f}]' info_str = info_str_format.format(label, file_path, duration, num_samples, sr, np.min(y), np.max(y)) print(info_str) # Escape some LaTeX special characters info_str_tex = info_str.replace('_', '\\_') plt.figure(figsize=(10, 7)) plt.subplot(3, 1, 1) display.waveplot(y, sr=sr) plt.title('Monophonic') # Plot waveforms. y_harm, y_perc = librosa.effects.hpss(y) plt.subplot(3, 1, 2) display.waveplot(y_harm, sr=sr, alpha=0.33) display.waveplot(y_perc, sr=sr, color='r', alpha=0.40) plt.title('Harmonic and Percussive') # Add file information. plt.subplot(3, 1, 3) plt.axis('off') plt.text(0.0, 1.0, info_str_tex, color='black', verticalalignment='top') plt.tight_layout() # Calculating MEL spectrogram and MFCC. 
db_pow = np.abs( librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))**2 s_mel = librosa.feature.melspectrogram(S=db_pow, sr=sr, hop_length=hop_length, fmax=f_max, fmin=f_min, n_mels=n_mels) s_mel = librosa.power_to_db(s_mel, ref=np.max) s_mfcc = librosa.feature.mfcc(S=s_mel, sr=sr, n_mfcc=n_mfcc) # STFT (Short-time Fourier Transform) # https://librosa.github.io/librosa/generated/librosa.core.stft.html plt.figure(figsize=(12, 10)) db = librosa.amplitude_to_db(librosa.magphase(librosa.stft(y))[0], ref=np.max) plt.subplot(3, 2, 1) display.specshow(db, sr=sr, x_axis='time', y_axis='linear', hop_length=hop_length) plt.colorbar(format='%+2.0f dB') plt.title('Linear-frequency power spectrogram') plt.subplot(3, 2, 2) display.specshow(db, sr=sr, x_axis='time', y_axis='log', hop_length=hop_length) plt.colorbar(format='%+2.0f dB') plt.title('Log-frequency power spectrogram') plt.subplot(3, 2, 3) display.specshow(s_mfcc, sr=sr, x_axis='time', y_axis='linear', hop_length=hop_length) plt.colorbar(format='%+2.0f dB') plt.title('MFCC spectrogram') # # CQT (Constant-T Transform) # # https://librosa.github.io/librosa/generated/librosa.core.cqt.html cqt = librosa.amplitude_to_db(librosa.magphase(librosa.cqt(y, sr=sr))[0], ref=np.max) # plt.subplot(3, 2, 3) # display.specshow(cqt, sr=sr, x_axis='time', y_axis='cqt_note', hop_length=hop_length) # plt.colorbar(format='%+2.0f dB') # plt.title('Constant-Q power spectrogram (note)') plt.subplot(3, 2, 4) display.specshow(cqt, sr=sr, x_axis='time', y_axis='cqt_hz', hop_length=hop_length) plt.colorbar(format='%+2.0f dB') plt.title('Constant-Q power spectrogram (Hz)') plt.subplot(3, 2, 5) display.specshow(db, sr=sr, x_axis='time', y_axis='log', hop_length=hop_length) plt.colorbar(format='%+2.0f dB') plt.title('Log power spectrogram') plt.subplot(3, 2, 6) display.specshow(s_mel, x_axis='time', y_axis='mel', hop_length=hop_length) plt.colorbar(format='%+2.0f dB') plt.title('Mel spectrogram') # TODO Import project used features (python_speech_features). 
# norm_features = 'none' # mfcc = load_sample(file_path, feature_type='mfcc', feature_normalization=norm_features)[0] # mfcc = np.swapaxes(mfcc, 0, 1) # # mel = load_sample(file_path, feature_type='mel', feature_normalization=norm_features)[0] # mel = np.swapaxes(mel, 0, 1) (__sr, __y) = wavfile.read(file_path) num_features = 26 win_len = WIN_LENGTH win_step = WIN_STEP __mel = psf.logfbank(signal=__y, samplerate=__sr, winlen=win_len, winstep=win_step, nfilt=num_features, nfft=n_fft, lowfreq=f_min, highfreq=f_max, preemph=0.97) __mfcc = psf.mfcc(signal=__y, samplerate=__sr, winlen=win_len, winstep=win_step, numcep=num_features // 2, nfilt=num_features, nfft=n_fft, lowfreq=f_min, highfreq=f_max, preemph=0.97, ceplifter=22, appendEnergy=False) __mfcc = __mfcc.astype(np.float32) __mel = __mel.astype(np.float32) __mfcc = np.swapaxes(__mfcc, 0, 1) __mel = np.swapaxes(__mel, 0, 1) plt.figure(figsize=(5.2, 1.6)) display.waveplot(y, sr=sr) fig = plt.figure(figsize=(10, 4)) plt.subplot(2, 1, 2) display.specshow(__mfcc, sr=__sr, x_axis='time', y_axis='mel', hop_length=win_step * __sr) # plt.set_cmap('magma') # plt.xticks(rotation=295) plt.xlabel('Time (s)') plt.ylabel('Frequency (Hz)') plt.xlim(xmin=0) plt.ylim(0, 8000) plt.colorbar(format='%+2.0f') plt.title('MFCC', visible=False) plt.subplot(2, 1, 1) display.specshow(__mel, sr=__sr, x_axis='time', y_axis='mel', hop_length=win_step * __sr) # plt.set_cmap('magma') # plt.xticks(rotation=295) plt.xlabel('Time (s)') plt.ylabel('Frequency (Hz)') plt.xlim(xmin=0) plt.ylim(0, 8000) plt.colorbar(format='%+2.0f', label='Power (dB)') plt.title('Mel Spectrogram', visible=False) plt.tight_layout() fig.savefig('/tmp/mel-mfcc-plot-we-did-it.pdf', bbox_inches='tight') plt.show()
def hpss_wav(y): H, P = librosa.decompose.hpss(librosa.stft(y)) return librosa.istft(H), librosa.istft(P)
def gen_audio_features(item, config): """Generate audio features and transformations Args: item (Dict): dictionary containing the attributes to encode. config (Dict): configuration dictionary. Returns: (bool): keep this sample or not. mel (ndarray): mel matrix in np.float32. energy (ndarray): energy audio profile. f0 (ndarray): fundamental frequency. item (Dict): dictionary containing the updated attributes. """ # get info from sample. audio = item["audio"] utt_id = item["utt_id"] rate = item["rate"] # check audio properties assert len(audio.shape) == 1, f"{utt_id} seems to be multi-channel signal." assert np.abs( audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM." # check sample rate if rate != config["sampling_rate"]: audio = librosa.resample(audio, rate, config["sampling_rate"]) logging.info( f"{utt_id} sampling rate is {rate}, not {config['sampling_rate']}, we resample it." ) # trim silence if config["trim_silence"]: if "trim_mfa" in config and config["trim_mfa"]: _, item["text_ids"], audio = ph_based_trim( config, utt_id, item["text_ids"], item["raw_text"], audio, config["hop_size"], ) if ( audio.__len__() < 1 ): # very short files can get trimmed fully if mfa didnt extract any tokens LibriTTS maybe take only longer files? logging.warning( f"File have only silence or MFA didnt extract any token {utt_id}" ) return False, None, None, None, item else: audio, _ = librosa.effects.trim( audio, top_db=config["trim_threshold_in_db"], frame_length=config["trim_frame_size"], hop_length=config["trim_hop_size"], ) # resample audio if necessary if "sampling_rate_for_feats" in config: audio = librosa.resample(audio, rate, config["sampling_rate_for_feats"]) sampling_rate = config["sampling_rate_for_feats"] assert ( config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0 ), "'hop_size' must be 'int' value. Please check if 'sampling_rate_for_feats' is correct." hop_size = config["hop_size"] * config[ "sampling_rate_for_feats"] // rate else: sampling_rate = config["sampling_rate"] hop_size = config["hop_size"] # get spectrogram D = librosa.stft( audio, n_fft=config["fft_size"], hop_length=hop_size, win_length=config["win_length"], window=config["window"], pad_mode="reflect", ) S, _ = librosa.magphase(D) # (#bins, #frames) # get mel basis fmin = 0 if config["fmin"] is None else config["fmin"] fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"] mel_basis = librosa.filters.mel( sr=sampling_rate, n_fft=config["fft_size"], n_mels=config["num_mels"], fmin=fmin, fmax=fmax, ) mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T # (#frames, #bins) mel_eos = np.zeros(shape=[1, np.shape(mel)[1] ]) # (1, #bins) # represent mel for eos_token. mel = np.concatenate([mel, mel_eos], axis=0) # (#frames + 1, #bins) # check audio and feature length audio_eos = np.zeros( shape=[hop_size]) # (hop_size) # represent audio for eos_token. 
audio = np.concatenate([audio, audio_eos], axis=-1) audio = np.pad(audio, (0, config["fft_size"]), mode="edge") audio = audio[:len(mel) * hop_size] assert len(mel) * hop_size == len( audio), f"{len(mel) * hope_size}, {len(audio)}" # extract raw pitch _f0, t = pw.dio( audio.astype(np.double), fs=sampling_rate, f0_ceil=fmax, frame_period=1000 * hop_size / sampling_rate, ) f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate) if len(f0) >= len(mel): f0 = f0[:len(mel)] else: f0 = np.pad(f0, (0, len(mel) - len(f0))) # extract energy energy = np.sqrt(np.sum(S**2, axis=0)) energy = np.concatenate([energy, [0]], axis=-1) # # represent energy for eos_token. assert len(mel) == len(f0) == len( energy), f"{len(mel)}, {len(f0)}, {len(energy)}" # apply global gain if config["global_gain_scale"] > 0.0: audio *= config["global_gain_scale"] if np.abs(audio).max() >= 1.0: logging.warn( f"{utt_id} causes clipping. It is better to reconsider global gain scale value." ) item["audio"] = audio item["mel"] = mel item["f0"] = remove_outlier(f0) item["energy"] = remove_outlier(energy) return True, mel, energy, f0, item
t = np.linspace(0, N/fe, N);
s = 0.2*np.cos(2*np.pi*200*t) + 2*np.cos(2*np.pi*400*t);
tf = np.linspace(0, fe/N, N);

plt.subplot(1, 2, 1);
plt.plot(t[:200], s[:200]);
plt.title('280Hz and 500Hz, fe=8000Hz')
plt.subplot(1, 2, 2);
plt.plot(np.abs(np.fft.fft(s)));
plt.title('280Hz and 500Hz, fe=8000Hz')
"""

#x, fe = librosa.load('ressources/mesange-tete-noire.wav')
x, fe = librosa.load('ressources/PIANO.wav')
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=fe)
plt.title('')
plt.show()

fe /= 2
n = len(x)
t = np.linspace(0, n / fe, n, endpoint=False)
s = 0.75 * np.cos(2 * np.pi * 440 * t)
plt.plot(t, x)
plt.plot(np.abs(np.fft.fft(s)))

S = np.abs(librosa.stft(s))
Sdb = librosa.amplitude_to_db(abs(S))
#librosa.display.specshow(Sdb, sr=fe, x_axis='time', y_axis='hz')

sd.play(x, fe)
status = sd.wait()
def calculate_melsp(x, n_fft=1024, hop_length=128): stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))**2 log_stft = librosa.power_to_db(stft) melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128) return melsp
def decompose_audio(y): harmonic, percussive = librosa.decompose.hpss(librosa.stft(y), margin=2) harmonic = librosa.istft(harmonic) percussive = librosa.istft(percussive) return harmonic, percussive
import ntpath root = os.path.dirname(os.path.realpath(__file__)) path_name = r'test' direc_name = os.path.join(root, path_name) train_path = r'test/audio' csv_file = os.path.join(direc_name, 'features_test.csv') folders = os.listdir(path_name) with open(csv_file, "w", newline='') as output: audio_class_folder = os.path.join(root, train_path) files = os.listdir(audio_class_folder) for file in files: print(file) X, samp_rate = librosa.load(os.path.join(audio_class_folder, file)) stft = np.array(np.abs(librosa.stft(X))) mfcc = np.array( np.mean(librosa.feature.mfcc(y=X, sr=samp_rate, n_mfcc=40).T, axis=0)) chroma = np.array( np.mean(librosa.feature.chroma_stft(S=stft, sr=samp_rate).T, axis=0)) contrast = np.array( np.mean(librosa.feature.spectral_contrast(S=stft, sr=samp_rate).T, axis=0)) features = np.append(mfcc, chroma) features = np.append(features, contrast) features_full = features.tolist() writer = csv.writer(output, delimiter=',') writer.writerow(features_full) print('Yay')