def note_specgram(path, ax, peak=70.0, use_cqt=True):
    # Add several samples together
    if isinstance(path, list):
        for i, p in enumerate(path):
            sr, a = readwav(p)
            audio = a if i == 0 else a + audio
    # Load one sample
    else:
        sr, audio = readwav(path)
    audio = audio.astype(np.float32)
    # n_fft, hop_length, over_sample, res_factor, octaves, notes_per_octave
    # and the alpha-mask colormap my_mask are module-level constants; see
    # rainbow() below for representative values.
    if use_cqt:
        # real=False was removed from librosa.cqt (complex output is now
        # the default)
        C = librosa.cqt(audio, sr=sr, hop_length=hop_length,
                        bins_per_octave=int(notes_per_octave * over_sample),
                        n_bins=int(octaves * notes_per_octave * over_sample),
                        filter_scale=res_factor,
                        fmin=librosa.note_to_hz('C2'))
    else:
        C = librosa.stft(audio, n_fft=n_fft, win_length=n_fft,
                         hop_length=hop_length, center=True)
    mag, phase = librosa.magphase(C)
    phase_angle = np.angle(phase)
    phase_unwrapped = np.unwrap(phase_angle)
    # Finite difference of the unwrapped phase (instantaneous frequency),
    # normalized to [-1, 1] by dividing by pi
    dphase = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1]
    dphase = np.concatenate([phase_unwrapped[:, 0:1], dphase], axis=1) / np.pi
    # librosa.logamplitude / ref_power were removed; power_to_db / ref is
    # the current API
    mag = (librosa.power_to_db(mag**2, amin=1e-13, top_db=peak,
                               ref=np.max) / peak) + 1
    ax.matshow(dphase[::-1, :], cmap=plt.cm.rainbow)
    ax.matshow(mag[::-1, :], cmap=my_mask)
def read_audio(path):
    """
    :param path: Can be a list, or a single string
    :return: sr, audio
    """
    # Add several samples together
    if isinstance(path, list):
        for i, p in enumerate(path):
            sr, a = readwav(p)
            audio = a if i == 0 else a + audio
    # Load one sample
    else:
        sr, audio = readwav(path)
    audio = audio.astype(np.float32)
    return sr, audio
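A minimal usage sketch for read_audio, assuming readwav is scipy.io.wavfile.read and the placeholder WAV files exist. Note that the list branch sums the raw integer arrays before the float cast, so loud int16 clips can overflow; all inputs must also share the same length and sample rate.

# Usage sketch ('a.wav' and 'b.wav' are placeholder paths; readwav is
# assumed to be scipy.io.wavfile.read).
sr, single = read_audio('a.wav')
sr, mixed = read_audio(['a.wav', 'b.wav'])  # elementwise sum of both clips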
def make_step(self, size_in, size_out, dt, rng):
    assert size_in[0] == 0
    assert size_out[0] == 1
    rate = 1. / dt

    orig_rate, orig = readwav(self.path)
    new_size = int(orig.size * (rate / orig_rate))
    wave = resample(orig, new_size)
    wave -= wave.mean()

    # Normalize wave to desired rms
    wave_rms = npext.rms(wave)
    wave *= (self.rms / wave_rms)

    if self.at_end == 'loop':
        def step_wavfileloop(t):
            idx = int(t * rate) % wave.size
            return wave[idx]
        return step_wavfileloop

    elif self.at_end == 'stop':
        def step_wavfilestop(t):
            idx = int(t * rate)
            # Use >= so that idx == wave.size does not index out of bounds
            if idx >= wave.size:
                return 0.
            else:
                return wave[idx]
        return step_wavfilestop
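A standalone sketch of the rate conversion used in make_step, assuming the enclosing process is stepped at the simulator rate 1/dt; a one-second clip at 44.1 kHz becomes 1000 samples when dt = 0.001:

# Standalone sketch of the resampling arithmetic above (synthetic audio;
# dt = 0.001 is an assumed simulator timestep, i.e. 1000 samples/s).
import numpy as np
from scipy.signal import resample

dt = 0.001
orig_rate = 44100
orig = np.random.randn(orig_rate).astype(np.float32)  # 1 second of noise
new_size = int(orig.size * ((1. / dt) / orig_rate))   # -> 1000 samples
wave = resample(orig, new_size)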
def main():
    import argparse
    from scipy.io.wavfile import read as readwav
    import numpy as np
    parser = argparse.ArgumentParser(
        description='Compute the SRMR metric for a given WAV file')
    parser.add_argument('-f', '--fast', dest='fast', action='store_true',
                        default=False,
                        help='Use the faster version based on the gammatonegram')
    parser.add_argument('-n', '--norm', dest='norm', action='store_true',
                        default=False,
                        help='Use modulation spectrum energy normalization')
    parser.add_argument('--ncochlearfilters', dest='n_cochlear_filters',
                        type=int, default=23,
                        help='Number of filters in the acoustic filterbank')
    parser.add_argument('--mincf', dest='min_cf', type=float, default=4.0,
                        help='Center frequency of the first modulation filter')
    parser.add_argument('--maxcf', dest='max_cf', type=float, default=128.0,
                        help='Center frequency of the last modulation filter')
    parser.add_argument('path', metavar='path', nargs='+',
                        help='Path of the file or files to be processed. '
                             'Can also be a folder.')
    args = parser.parse_args()

    for f in args.path:
        fs, s = readwav(f)
        # np.int was removed from NumPy; np.integer matches all int dtypes
        if np.issubdtype(s.dtype, np.integer):
            s = s.astype('float') / np.iinfo(s.dtype).max
        srmr = SRMR(fs, n_cochlear_filters=args.n_cochlear_filters,
                    min_cf=args.min_cf, max_cf=args.max_cf,
                    fast=args.fast, norm=args.norm)
        out = srmr.predict(s, s, s)
        r, energy = out['p']['srmr'], out['avg_energy']
        print('%s, %f' % (f, r))
def load_wavs_from_dir(path):
    fnames = map(lambda f: f.path,
                 filter(lambda f: f.is_file(), os.scandir(path)))
    fnames, rates, wavs = map(
        np.asarray, zip(*tuple(map(lambda f: (f, *readwav(f)), fnames))))
    wrong_rates = (rates != sample_rate)
    if wrong_rates.any():
        print('Following files have wrong sample rate (!= {}):'.format(
            sample_rate) + '\n\t' + '\n\t'.join(
            map(lambda t: '"{}" | rate = {}'.format(*t),
                np.vstack((fnames, rates))[:, wrong_rates].transpose())),
            file=stderr)
        print('\nSkipping them')
        wavs = wavs[~wrong_rates]
    maxlen = max(i.shape[0] for i in wavs)
    ar = []
    for i in wavs:
        s = i[:] if len(i.shape) == 1 else i[:, 0]  # Use single channel
        s = s.copy()
        s.resize((maxlen, ))  # Zero-pads in place up to maxlen
        ar.append(s)
    data = np.vstack(ar)
    return data
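A quick illustration of the in-place padding that brings shorter clips up to maxlen above; ndarray.resize, unlike np.resize, pads with zeros rather than repeating data:

# ndarray.resize pads with zeros in place (np.resize would instead tile
# the data), which is what makes the per-clip padding above safe.
import numpy as np
s = np.array([1., 2., 3.])
s.resize((5,))
print(s)  # [1. 2. 3. 0. 0.]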
def readwav(filename):
    """Read a WAV file and return the signal and sample rate ::

        from spectrum.io import readwav
        signal, samplerate = readwav(filename)

    """
    # Alias the scipy reader so it does not shadow this function's name
    from scipy.io.wavfile import read as scipy_readwav
    samplerate, signal = scipy_readwav(filename)
    return signal, samplerate
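Worth noting when using this wrapper: it flips scipy's return order and yields the signal first. A one-line usage sketch with a placeholder filename:

# Flipped return order relative to scipy.io.wavfile.read
# ('speech.wav' is a placeholder path):
signal, samplerate = readwav('speech.wav')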
def process_file(f, args):
    fs, s = readwav(f)
    if len(s.shape) > 1:
        s = s[:, 0]
    # np.int was removed from NumPy; np.integer matches all int dtypes
    if np.issubdtype(s.dtype, np.integer):
        s = s.astype('float') / np.iinfo(s.dtype).max
    r, energy = srmr(s, fs, n_cochlear_filters=args.n_cochlear_filters,
                     min_cf=args.min_cf, max_cf=args.max_cf,
                     fast=args.fast, norm=args.norm)
    return f, r
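A hypothetical calling sketch for process_file. It assumes srmr and readwav are importable at module level (the srmr() call above matches srmrpy's function) and that args carries the same attributes as the argparse namespace built in main(); the Namespace values below mirror those defaults:

# Hypothetical usage: an argparse-like namespace with the defaults from
# main() above ('speech.wav' is a placeholder path).
from argparse import Namespace

args = Namespace(n_cochlear_filters=23, min_cf=4.0, max_cf=128.0,
                 fast=False, norm=False)
fname, ratio = process_file('speech.wav', args)
print('%s, %f' % (fname, ratio))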
def cqt(fname, dst):
    sr, audio = readwav(fname)
    CQT = librosa.amplitude_to_db(
        np.abs(librosa.cqt(audio.astype(np.float32), sr=sr)), ref=np.max)
    fig, ax = plt.subplots(figsize=(6, 6))
    d = librosa.display.specshow(CQT, y_axis='cqt_note', ax=ax)
    fig.colorbar(d, ax=ax, format='%+2.0f dB')
    ax.set_title('Constant-Q power spectrogram (note)')
    out_path = dst / os.path.basename(fname).replace('.wav', '_cqt.jpg')
    plt.savefig(out_path, dpi=100)
def chroma(fname, dst):
    sr, audio = readwav(fname)
    C = librosa.feature.chroma_cqt(y=audio.astype(np.float32), sr=sr)
    fig, ax = plt.subplots(figsize=(6, 6))
    # Pass ax explicitly rather than relying on the current axes
    d = librosa.display.specshow(C, y_axis='chroma', ax=ax)
    fig.colorbar(d, ax=ax)
    ax.set_title('Chromagram')
    if not os.path.exists(dst):
        os.makedirs(dst)
    out_path = dst / os.path.basename(fname).replace('.wav', '_chroma.jpg')
    plt.savefig(out_path, dpi=100)
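Both plotting helpers join paths with the / operator, so dst must be a pathlib.Path rather than a plain string; also note that chroma() creates the output directory while cqt() assumes it already exists. A usage sketch with placeholder names:

# Usage sketch ('note.wav' and 'figs' are placeholder names). chroma()
# creates the directory; cqt() does not, so call it on an existing dir.
from pathlib import Path
chroma('note.wav', Path('figs'))
cqt('note.wav', Path('figs'))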
def rainbow(fname, dst, peak=70.0, use_cqt=True):
    # Constants
    n_fft = 512
    hop_length = 256
    over_sample = 4
    res_factor = 0.8
    octaves = 6
    notes_per_octave = 10

    # Alpha-only colormap: opaque at low values, transparent at high, so the
    # magnitude layer masks the phase layer underneath. Registering it via
    # plt.register_cmap is unnecessary (and deprecated in recent Matplotlib)
    # since the colormap object is passed directly below.
    cdict = {
        'red': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
        'green': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
        'blue': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
        'alpha': ((0.0, 1.0, 1.0), (1.0, 0.0, 0.0))
    }
    my_mask = matplotlib.colors.LinearSegmentedColormap('MyMask', cdict)

    fig, ax = plt.subplots(figsize=(6, 6))
    sr, audio = readwav(fname)
    audio = audio.astype(np.float32)
    if use_cqt:
        C = librosa.cqt(audio, sr=sr, hop_length=hop_length,
                        bins_per_octave=int(notes_per_octave * over_sample),
                        n_bins=int(octaves * notes_per_octave * over_sample),
                        filter_scale=res_factor,
                        fmin=librosa.note_to_hz('C2'))
    else:
        C = librosa.stft(audio, n_fft=n_fft, win_length=n_fft,
                         hop_length=hop_length, center=True)
    mag, phase = librosa.magphase(C)
    phase_angle = np.angle(phase)
    phase_unwrapped = np.unwrap(phase_angle)
    dphase = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1]
    dphase = np.concatenate([phase_unwrapped[:, 0:1], dphase], axis=1) / np.pi
    mag = (librosa.power_to_db(mag**2, amin=1e-13, top_db=peak,
                               ref=np.max) / peak) + 1
    # np.flipud(x[::-1, :]) is x again: the two flips cancel, so the arrays
    # can be plotted directly with origin='lower'
    ax.imshow(dphase, cmap=plt.cm.rainbow, origin='lower')
    ax.imshow(mag, cmap=my_mask, origin='lower')
    ax.set_title(f"Rainbow Spectrogram for {os.path.basename(fname)}")
    out_path = dst / os.path.basename(fname).replace('.wav', '_rainbow.jpg')
    plt.savefig(out_path, dpi=100)
def note_specgram(path, ax, use_cqt=True):
    # Add several samples together
    if isinstance(path, list):
        for i, f in enumerate(path):
            sr, a = readwav(f)
            audio = a if i == 0 else a + audio
    # Load one sample
    else:
        sr, audio = readwav(path)
    audio = audio.astype(np.float32)

    # Constants
    hop_length = 256
    if use_cqt:
        over_sample = 4
        res_factor = 1.0  # 0.8
        octaves = 6
        notes_per_octave = 12
        C = librosa.cqt(audio, sr=sr, hop_length=hop_length,
                        bins_per_octave=int(notes_per_octave * over_sample),
                        n_bins=int(octaves * notes_per_octave * over_sample),
                        filter_scale=res_factor,
                        fmin=librosa.note_to_hz('C3'))
    else:
        n_fft = 512
        C = librosa.stft(audio, n_fft=n_fft, win_length=n_fft,
                         hop_length=hop_length, center=True)
    plot_rainbow(ax, C)
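plot_rainbow is called above but not defined anywhere in this section. A minimal sketch, assuming it factors out the same magnitude/phase plotting done inline in the first note_specgram variant and in rainbow(), and that my_mask (the alpha-only colormap built in rainbow()) is in scope:

# Minimal sketch of the plot_rainbow helper; not part of the original
# section. Assumes my_mask, the alpha-mask colormap from rainbow(), exists.
def plot_rainbow(ax, C, peak=70.0):
    mag, phase = librosa.magphase(C)
    # Instantaneous frequency: finite difference of the unwrapped phase,
    # scaled to [-1, 1] by dividing by pi
    phase_unwrapped = np.unwrap(np.angle(phase))
    dphase = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1]
    dphase = np.concatenate([phase_unwrapped[:, 0:1], dphase], axis=1) / np.pi
    # Log magnitude mapped to [0, 1], drawn as an alpha mask over the phase
    mag = (librosa.power_to_db(mag**2, amin=1e-13, top_db=peak,
                               ref=np.max) / peak) + 1
    ax.matshow(dphase[::-1, :], cmap=plt.cm.rainbow)
    ax.matshow(mag[::-1, :], cmap=my_mask)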
def simple_energy_vad(x, fs, framelen=0.02, theta_main=30, theta_min=-55):
    # NOTE: the signature above is reconstructed from the body and the call
    # in __main__ below; the default values follow srmrpy's simple_energy_vad
    # and are an assumption. Only the tail of the original docstring
    # survived:
    '''... this is the benchmark method, not the method proposed by the
    authors).
    '''
    # Split signal in frames
    framelen = int(framelen * fs)
    frames = segment_axis(x, length=framelen, overlap=0, end='pad')
    frames_zero_mean = frames - frames.mean(axis=0)
    frame_energy = 10 * np.log10(
        1 / (framelen - 1) * (frames_zero_mean**2).sum(axis=1) + 1e-6)

    # A frame counts as speech if it is within theta_main dB of the loudest
    # frame and above the absolute floor theta_min dB
    max_energy = max(frame_energy)
    speech_presence = ((frame_energy > max_energy - theta_main)
                       & (frame_energy > theta_min))

    x_vad = np.zeros_like(x, dtype=bool)
    for idx in range(len(frames)):
        x_vad[idx * framelen:(idx + 1) * framelen] = speech_presence[idx]
    return x[x_vad], x_vad


if __name__ == '__main__':
    import sys
    from scipy.io.wavfile import read as readwav
    from matplotlib import pyplot as plt
    fs, s = readwav(sys.argv[1])
    s = s.astype('float') / np.iinfo(s.dtype).max
    s_vad, speech_presence = simple_energy_vad(s, fs)
    plt.plot(s)
    plt.plot(s_vad - 1, 'g')
    plt.show()
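The two thresholds interact as follows: a frame passes only if it is both within theta_main dB of the loudest frame and above the absolute floor theta_min dB. A tiny worked example with illustrative numbers (the thresholds match the assumed defaults above):

# Illustrative numbers only: four frame energies in dB against the
# thresholds theta_main=30, theta_min=-55.
import numpy as np

frame_energy = np.array([-60., -20., -35., -58.])
theta_main, theta_min = 30., -55.
speech = ((frame_energy > frame_energy.max() - theta_main)
          & (frame_energy > theta_min))
print(speech)  # [False  True  True False]: -60 and -58 fall more than
               # 30 dB below the max and also sit under the -55 dB floor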