def test_reconstruct_sound():
    """Round-trip a short sax phrase through the STFT and check the result.

    The sound is analysed with ``stft.from_audio`` and resynthesised with
    ``stft.to_audio``; the test then checks the input length, the number of
    analysis frames, summary statistics of both spectrograms and the RMSE
    between the input and the reconstruction.
    """
    hop_size = 2048
    fft_size = 4096
    window_size = 4001

    fs, x = audio.read_wav(sound_path("sax-phrase-short.wav"))
    window = get_window('hamming', window_size)

    mag_spectrogram, phase_spectrogram = stft.from_audio(
        x, window, fft_size, hop_size)
    x_reconstructed = stft.to_audio(
        mag_spectrogram, phase_spectrogram, window_size, hop_size)

    sample_count = len(x)
    assert 138746 == sample_count

    # one frame per hop, the last (partial) hop included
    expected_frame_count = int(math.ceil(sample_count / float(hop_size)))
    assert expected_frame_count == len(mag_spectrogram)
    assert expected_frame_count == len(phase_spectrogram)

    # statistics of the spectrogram for regression testing without
    # explicitly storing the whole data
    assert np.allclose(-102.86187076588583, np.mean(mag_spectrogram))
    assert np.allclose(11.368333745102881, np.mean(phase_spectrogram))

    # TODO: should be the same as len(x)
    assert expected_frame_count * hop_size == len(x_reconstructed)

    # compare only the part of the reconstruction that overlaps the input
    overlap = x_reconstructed[:sample_count]
    assert np.allclose(0.0014030089623073237, rmse(x, overlap))
# NOTE(review): indentation was lost in this chunk; the next three statements
# may belong to a per-frame loop whose header is outside the visible source.
# Confirm the nesting before applying.
salience_peaks_bins, salience_peaks_saliences = run_pitch_salience_function_peaks(salience)
pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
pool.add('allframes_salience_peaks_saliences', salience_peaks_saliences)

# build the pitch contours from the salience peaks collected in the pool
contours_bins, contours_saliences, contours_start_times, duration = run_pitch_contours(
    pool['allframes_salience_peaks_bins'],
    pool['allframes_salience_peaks_saliences'])
pitch, confidence = run_pitch_contours_melody(
    contours_bins, contours_saliences, contours_start_times, duration)

figure(1, figsize=(9, 6))

# magnitude spectrogram used as the background of the plot
mX, pX = stft.from_audio(audio, hamming(frameSize), frameSize, hopSize)
maxplotfreq = 3000.0
# highest bin to draw; it must be an integer because it is used as a slice
# bound (NumPy rejects float slice bounds)
maxplotbin = int(frameSize * maxplotfreq / sampleRate)
numFrames = int(mX.shape[0])
frmTime = hopSize * arange(numFrames) / float(sampleRate)
binFreq = arange(maxplotbin + 1) * float(sampleRate) / frameSize
plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :maxplotbin + 1]))
plt.autoscale(tight=True)

# draw every contour on top of the spectrogram, shifted back by half a frame
offset = .5 * frameSize / sampleRate
for i, contour_bins in enumerate(contours_bins):
    time = (contours_start_times[i] - offset
            + hopSize * arange(size(contour_bins)) / float(sampleRate))
    # bins are converted to Hz as 55 * 2 ** (bin * 10 / 1200)
    contours_freq = 55.0 * pow(2, array(contour_bins) * 10 / 1200.0)
    plot(time, contours_freq, color='k', linewidth=2)

plt.title('mX + F0 trajectories (carnatic.wav)')
tight_layout()
from smst.utils import audio
from smst.models import sine, stft

plt.figure(1, figsize=(9, 7))
plt.subplot(211)

# sound to analyse
fs, x = audio.read_wav('../../../sounds/vibraphone-C6.wav')

# analysis settings shared by the STFT and the sinusoidal analysis
w = np.blackman(401)
N, H = 512, 100
t = -100
minSineDur = .02
maxnSines = 150
freqDevOffset = 20
freqDevSlope = 0.01

mX, pX = stft.from_audio(x, w, N, H)
tfreq, tmag, tphase = sine.from_audio(
    x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)

# spectrogram restricted to the bins up to maxplotfreq
maxplotfreq = 10000.0
maxplotbin = int(N * maxplotfreq / fs)
numFrames = int(mX.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)
binFreq = np.arange(maxplotbin + 1) * float(fs) / N
plt.pcolormesh(frmTime, binFreq, mX[:, :maxplotbin + 1].T)
plt.autoscale(tight=True)

# sinusoidal tracks: values at or above maxplotfreq are zeroed, then every
# non-positive value becomes NaN so that it is not drawn
tracks = tfreq * (tfreq < maxplotfreq)
tracks[tracks <= 0] = np.nan
plt.plot(frmTime, tracks, color='k', lw=1.5)
plt.autoscale(tight=True)
def main(inputFile1=demo_sound_path('ocean.wav'),
         inputFile2=demo_sound_path('speech-male.wav'),
         window1='hamming', window2='hamming', M1=1024, M2=1024,
         N1=1024, N2=1024, H1=256, smoothf=.5, balancef=0.2,
         interactive=True, plotFile=False):
    """
    Function to perform a morph between two sounds
    inputFile1: name of input sound file to be used as source
    inputFile2: name of input sound file to be used as filter
    window1 and window2: windows for both files
    M1 and M2: window sizes for both files
    N1 and N2: fft sizes for both sounds
    H1: hop size for sound 1 (the one for sound 2 is computed automatically)
    smoothf: smoothing factor to be applied to magnitude spectrum of sound 2 before morphing
    balancef: balance factor between both sounds, 0 is sound 1 and 1 is sound 2
    interactive: when true, display the figure with plt.show()
    plotFile: when true, save the figure as a PNG under output_plots/
    """
    # read input sounds (fs is overwritten by the rate of the second file)
    (fs, x1) = audio.read_wav(inputFile1)
    (fs, x2) = audio.read_wav(inputFile2)

    # compute analysis windows
    w1 = get_window(window1, M1)
    w2 = get_window(window2, M2)

    # perform morphing
    y = stft.morph(x1, x2, fs, w1, N1, w2, N2, H1, smoothf, balancef)

    # compute the magnitude and phase spectrogram of input sound (for plotting)
    mX1, pX1 = stft.from_audio(x1, w1, N1, H1)

    # compute the magnitude and phase spectrogram of output sound (for plotting)
    mY, pY = stft.from_audio(y, w1, N1, H1)

    # write output sound
    outputFile = 'output_sounds/' + os.path.basename(
        inputFile1)[:-4] + '_stftMorph.wav'
    audio.write_wav(y, fs, outputFile)

    # create figure to plot
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 10000.0
    # highest bin to draw; must be an integer because it is used as a slice
    # bound (NumPy rejects float slice bounds)
    maxplotbin = int(N1 * maxplotfreq / fs)
    # both spectrograms use N1 and fs, so they share the frequency axis
    binFreq = np.arange(maxplotbin + 1) * float(fs) / N1

    # plot sound 1
    plt.subplot(4, 1, 1)
    plt.plot(np.arange(x1.size) / float(fs), x1)
    plt.axis([0, x1.size / float(fs), min(x1), max(x1)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot magnitude spectrogram of sound 1
    plt.subplot(4, 1, 2)
    numFrames = int(mX1.shape[0])
    frmTime = H1 * np.arange(numFrames) / float(fs)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX1[:, :maxplotbin + 1]))
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('magnitude spectrogram of x')
    plt.autoscale(tight=True)

    # plot magnitude spectrogram of morphed sound
    plt.subplot(4, 1, 3)
    numFrames = int(mY.shape[0])
    frmTime = H1 * np.arange(numFrames) / float(fs)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mY[:, :maxplotbin + 1]))
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('magnitude spectrogram of y')
    plt.autoscale(tight=True)

    # plot the morphed sound
    plt.subplot(4, 1, 4)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()

    # save before showing: once a blocking plt.show() returns the figure is
    # gone and savefig would write an empty image
    if plotFile:
        plt.savefig(
            'output_plots/%s_%s_stft_morph.png' %
            (files.strip_file(inputFile1), files.strip_file(inputFile2)))
    if interactive:
        plt.show()
def main(inputFile=demo_sound_path('sax-phrase-short.wav'), window='blackman',
         M=601, N=1024, t=-100, minSineDur=0.1, nH=100, minf0=350, maxf0=700,
         f0et=5, harmDevSlope=0.01, interactive=True, plotFile=False):
    """
    Perform analysis/synthesis using the harmonic plus residual model
    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    nH: maximum number of harmonics; minf0: minimum fundamental frequency in sound
    maxf0: maximum fundamental frequency in sound; f0et: maximum error accepted in f0 detection algorithm
    harmDevSlope: allowed deviation of harmonic tracks, higher harmonics have higher allowed deviation
    interactive: when true, display the figure with plt.show()
    plotFile: when true, save the figure as a PNG under output_plots/
    """
    # size of fft used in synthesis
    Ns = 512

    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    (fs, x) = audio.read_wav(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # find harmonics and residual
    hfreq, hmag, hphase, xr = hpr.from_audio(
        x, fs, w, N, H, t, minSineDur, nH, minf0, maxf0, f0et, harmDevSlope)

    # compute spectrogram of residual
    mXr, pXr = stft.from_audio(xr, w, N, H)

    # synthesize hpr model
    y, yh = hpr.to_audio(hfreq, hmag, hphase, xr, Ns, H, fs)

    # output sound file (monophonic with sampling rate of 44100)
    baseFileName = files.strip_file(inputFile)
    outputFileSines, outputFileResidual, outputFile = [
        'output_sounds/%s_hprModel%s.wav' % (baseFileName, i)
        for i in ('_sines', '_residual', '')
    ]

    # write sounds files for harmonics, residual, and the sum
    audio.write_wav(yh, fs, outputFileSines)
    audio.write_wav(xr, fs, outputFileResidual)
    audio.write_wav(y, fs, outputFile)

    # create figure to plot
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the magnitude spectrogram of residual
    plt.subplot(3, 1, 2)
    maxplotbin = int(N * maxplotfreq / fs)
    numFrames = int(mXr.shape[0])
    frmTime = H * np.arange(numFrames) / float(fs)
    binFreq = np.arange(maxplotbin + 1) * float(fs) / N
    plt.pcolormesh(frmTime, binFreq, np.transpose(mXr[:, :maxplotbin + 1]))
    plt.autoscale(tight=True)

    # plot harmonic frequencies on residual spectrogram
    if hfreq.shape[1] > 0:
        # values at or above maxplotfreq are zeroed, and zeros become NaN so
        # that they are not drawn
        harms = hfreq * np.less(hfreq, maxplotfreq)
        harms[harms == 0] = np.nan
        numFrames = int(harms.shape[0])
        frmTime = H * np.arange(numFrames) / float(fs)
        plt.plot(frmTime, harms, color='k', ms=3, alpha=1)
        plt.xlabel('time(s)')
        plt.ylabel('frequency(Hz)')
        plt.autoscale(tight=True)
        plt.title('harmonics + residual spectrogram')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()

    # save before showing: once a blocking plt.show() returns the figure is
    # gone and savefig would write an empty image
    if plotFile:
        plt.savefig('output_plots/%s_hpr_model.png' % baseFileName)
    if interactive:
        plt.show()
# select the Agg backend before pyplot is imported
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from smst.utils import audio
from smst.models import stft

fs, x = audio.read_wav('../../../sounds/piano.wav')
plt.figure(1, figsize=(9.5, 6))

# first analysis: window and FFT of 256 samples
N = 256
H = 128
w = np.hamming(N)
mX1, pX1 = stft.from_audio(x, w, N, H)

plt.subplot(211)
numFrames = int(mX1.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)
binFreq = np.arange(mX1.shape[1]) * float(fs) / N
plt.pcolormesh(frmTime, binFreq, mX1.T)
plt.title('mX (piano.wav), M=256, N=256, H=128')
plt.autoscale(tight=True)

# second analysis: window and FFT of 1024 samples, same hop size
N = 1024
H = 128
w = np.hamming(N)
mX2, pX2 = stft.from_audio(x, w, N, H)

plt.subplot(212)
numFrames = int(mX2.shape[0])
def main(inputFile=demo_sound_path('bendir.wav'), window='hamming', M=2001,
         N=2048, t=-80, minSineDur=0.02, maxnSines=150, freqDevOffset=10,
         freqDevSlope=0.001, interactive=True, plotFile=False):
    """
    Perform analysis/synthesis using the sinusoidal plus residual model
    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    interactive: when true, display the figure with plt.show()
    plotFile: when true, save the figure as a PNG under output_plots/
    """
    # size of fft used in synthesis
    Ns = 512

    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    (fs, x) = audio.read_wav(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # perform sinusoidal plus residual analysis
    tfreq, tmag, tphase, xr = spr.from_audio(
        x, fs, w, N, H, t, minSineDur, maxnSines, freqDevOffset, freqDevSlope)

    # compute spectrogram of residual
    mXr, pXr = stft.from_audio(xr, w, N, H)

    # sum sinusoids and residual
    y, ys = spr.to_audio(tfreq, tmag, tphase, xr, Ns, H, fs)

    # output sound file (monophonic with sampling rate of 44100)
    baseFileName = strip_file(inputFile)
    outputFileSines, outputFileResidual, outputFile = [
        'output_sounds/%s_sprModel%s.wav' % (baseFileName, i)
        for i in ('_sines', '_residual', '')
    ]

    # write sounds files for sinusoidal, residual, and the sum
    audio.write_wav(ys, fs, outputFileSines)
    audio.write_wav(xr, fs, outputFileResidual)
    audio.write_wav(y, fs, outputFile)

    # create figure to show plots
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the magnitude spectrogram of residual
    plt.subplot(3, 1, 2)
    maxplotbin = int(N * maxplotfreq / fs)
    numFrames = int(mXr.shape[0])
    frmTime = H * np.arange(numFrames) / float(fs)
    binFreq = np.arange(maxplotbin + 1) * float(fs) / N
    plt.pcolormesh(frmTime, binFreq, np.transpose(mXr[:, :maxplotbin + 1]))
    plt.autoscale(tight=True)

    # plot the sinusoidal frequencies on top of the residual spectrogram
    if tfreq.shape[1] > 0:
        # values at or above maxplotfreq are zeroed, and non-positive values
        # become NaN so that they are not drawn
        tracks = tfreq * np.less(tfreq, maxplotfreq)
        tracks[tracks <= 0] = np.nan
        plt.plot(frmTime, tracks, color='k')
        plt.title('sinusoidal tracks + residual spectrogram')
        plt.autoscale(tight=True)

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()

    # save before showing: once a blocking plt.show() returns the figure is
    # gone and savefig would write an empty image; the base name computed
    # above is reused so only one strip_file spelling is needed
    if plotFile:
        plt.savefig('output_plots/%s_spr_model.png' % baseFileName)
    if interactive:
        plt.show()
import numpy as np
from smst.utils import audio
from smst.models import sine, stft

(fs, x) = audio.read_wav('../../../sounds/mridangam.wav')

# analysis settings
w = np.hamming(801)
N = 2048
t = -90
minSineDur = .005
maxnSines = 150
freqDevOffset = 20
freqDevSlope = 0.02

# synthesis FFT size and hop size; floor division keeps the hop an integer
# under true division as well
Ns = 512
H = Ns // 4

mX, pX = stft.from_audio(x, w, N, H)
tfreq, tmag, tphase = sine.from_audio(
    x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)

# time-scaling envelope passed to sine.scale_time
timeScale = np.array(
    [.01, .0, .03, .03, .335, .4, .355, .42, .671, .8, .691, .82, .858, 1.2,
     .878, 1.22, 1.185, 1.6, 1.205, 1.62, 1.497, 2.0, 1.517, 2.02, 1.686, 2.4,
     1.706, 2.42, 1.978, 2.8])
ytfreq, ytmag = sine.scale_time(tfreq, tmag, timeScale)

# resynthesise without phases (an empty array is passed) and analyse the result
y = sine.to_audio(ytfreq, ytmag, np.array([]), Ns, H, fs)
mY, pY = stft.from_audio(y, w, N, H)

plt.figure(1, figsize=(12, 9))
maxplotfreq = 4000.0

# waveform of the input sound
plt.subplot(4, 1, 1)
plt.plot(np.arange(x.size) / float(fs), x, 'b')
plt.axis([0, x.size / float(fs), min(x), max(x)])
plt.title('x (mridangam.wav)')
def main(inputFile=demo_sound_path('piano.wav'), window='hamming', M=1024,
         N=1024, H=512, interactive=True, plotFile=False):
    """
    analysis/synthesis using the STFT
    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (choice of rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    H: hop size (at least 1/2 of analysis window size to have good overlap-add)
    interactive: when true, display the figure with plt.show()
    plotFile: when true, save the figure as a PNG under output_plots/
    """
    # read input sound (monophonic with sampling rate of 44100)
    fs, x = audio.read_wav(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # compute the magnitude and phase spectrogram
    mX, pX = stft.from_audio(x, w, N, H)

    # perform the inverse stft
    y = stft.to_audio(mX, pX, M, H)

    # output sound file (monophonic with sampling rate of 44100)
    baseFileName = strip_file(inputFile)
    outputFile = 'output_sounds/' + baseFileName + '_stft.wav'

    # write the sound resulting from the inverse stft
    audio.write_wav(y, fs, outputFile)

    # create figure to plot
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 5000.0
    # highest bin to draw; must be an integer because it is used as a slice
    # bound (NumPy rejects float slice bounds)
    maxplotbin = int(N * maxplotfreq / fs)
    # frequency axis shared by the magnitude and phase plots
    binFreq = np.arange(maxplotbin + 1) * float(fs) / N

    # plot the input sound
    plt.subplot(4, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot magnitude spectrogram
    plt.subplot(4, 1, 2)
    numFrames = int(mX.shape[0])
    frmTime = H * np.arange(numFrames) / float(fs)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :maxplotbin + 1]))
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('magnitude spectrogram')
    plt.autoscale(tight=True)

    # plot the phase spectrogram (difference between neighbouring bins, so it
    # has one row fewer than the frequency axis)
    plt.subplot(4, 1, 3)
    numFrames = int(pX.shape[0])
    frmTime = H * np.arange(numFrames) / float(fs)
    plt.pcolormesh(
        frmTime, binFreq,
        np.transpose(np.diff(pX[:, :maxplotbin + 1], axis=1)))
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('phase spectrogram (derivative)')
    plt.autoscale(tight=True)

    # plot the output sound
    plt.subplot(4, 1, 4)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()

    # save before showing: once a blocking plt.show() returns the figure is
    # gone and savefig would write an empty image; the base name computed
    # above is reused so only one strip_file spelling is needed
    if plotFile:
        plt.savefig('output_plots/%s_stft_model.png' % baseFileName)
    if interactive:
        plt.show()
(fs, x) = audio.read_wav('../../../sounds/flute-A4.wav')

# analysis settings
w = np.blackman(551)
N = 1024
t = -100
nH = 40
minf0 = 420
maxf0 = 460
f0et = 5
maxnpeaksTwm = 5
minSineDur = .1
harmDevSlope = 0.01

# synthesis FFT size and hop size; floor division keeps the hop an integer
# under true division as well
Ns = 512
H = Ns // 4

mX, pX = stft.from_audio(x, w, N, H)
hfreq, hmag, hphase = harmonic.from_audio(
    x, fs, w, N, H, t, nH, minf0, maxf0, f0et, harmDevSlope, minSineDur)

# residual after subtracting the harmonics, and its spectrogram
xr = residual.subtract_sinusoids(x, Ns, H, hfreq, hmag, hphase, fs)
mXr, pXr = stft.from_audio(xr, hamming(Ns), Ns, H)

maxplotfreq = 5000.0
# highest bin to draw; must be an integer because it is used as a slice bound
# (NumPy rejects float slice bounds)
maxplotbin = int(N * maxplotfreq / fs)

plt.figure(1, figsize=(9, 7))

plt.subplot(221)
numFrames = int(mX.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)
binFreq = np.arange(maxplotbin + 1) * float(fs) / N
plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :maxplotbin + 1]))
plt.autoscale(tight=True)

# harmonic frequencies at or above maxplotfreq are zeroed
harms = hfreq * np.less(hfreq, maxplotfreq)
# harmonic analysis settings
t = -90
minSineDur = 0.1
nH = 40
minf0, maxf0 = 350, 700
f0et = 8
harmDevSlope = 0.1

# synthesis FFT size and hop size
Ns, H = 512, 128

fs, x = audio.read_wav(inputFile)
w = get_window(window, M)

# harmonic plus residual analysis, and the spectrogram of the residual
hfreq, hmag, hphase, xr = hpr.from_audio(
    x, fs, w, N, H, t, minSineDur, nH, minf0, maxf0, f0et, harmDevSlope)
mXr, pXr = stft.from_audio(xr, w, N, H)

# envelopes handed to harmonic.scale_frequencies
freqScaling = np.array([0, 1.5, 1, 1.5])
freqStretching = np.array([0, 1.1, 1, 1.1])
timbrePreservation = 1

hfreqt, hmagt = harmonic.scale_frequencies(
    hfreq, hmag, freqScaling, freqStretching, timbrePreservation, fs)

# resynthesise without phases (an empty array is passed) and store the result
y, yh = hpr.to_audio(hfreqt, hmagt, np.array([]), xr, Ns, H, fs)
audio.write_wav(y, fs, 'hpr-freq-transformation.wav')

plt.figure(figsize=(12, 9))
from smst.utils import audio
from smst.models import sine, stft

(fs, x) = audio.read_wav('../../../sounds/speech-male.wav')

# excerpt to analyse, in seconds; the sample positions are truncated to
# integers because NumPy rejects float slice bounds
start = 1.25
end = 1.79
x1 = x[int(start * fs):int(end * fs)]

# analysis settings
w = np.hamming(801)
N = 2048
H = 200
t = -70
minSineDur = 0
maxnSines = 150
freqDevOffset = 10
freqDevSlope = 0.001

mX, pX = stft.from_audio(x1, w, N, H)
tfreq, tmag, tphase = sine.from_audio(
    x1, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)

plt.figure(1, figsize=(9.5, 7))

# spectrogram restricted to the bins up to maxplotfreq
maxplotfreq = 800.0
maxplotbin = int(N * maxplotfreq / fs)
numFrames = int(mX.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)
binFreq = np.arange(maxplotbin + 1) * float(fs) / N
plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :maxplotbin + 1]))
plt.autoscale(tight=True)

# sinusoidal tracks: values at or above maxplotfreq are zeroed, then every
# non-positive value becomes NaN so that it is not drawn
tracks = tfreq * np.less(tfreq, maxplotfreq)
tracks[tracks <= 0] = np.nan
plt.plot(frmTime, tracks, 'x', color='k', markeredgewidth=1.5)
from smst.utils import audio
from smst.models import stft, stochastic

# fs is overwritten by the rate of the second file
(fs, x1) = audio.read_wav('../../../sounds/orchestra.wav')
(fs, x2) = audio.read_wav('../../../sounds/speech-male.wav')

# analysis settings for both sounds
w1 = np.hamming(1024)
N1 = 1024
H1 = 256
w2 = np.hamming(1024)
N2 = 1024
smoothf = .2
balancef = 0.5

y = stft.morph(x1, x2, fs, w1, N1, w2, N2, H1, smoothf, balancef)

# spectrograms used for plotting
mX2 = stochastic.from_audio(x2, H1, H1 * 2, smoothf)
mX, pX = stft.from_audio(x1, w1, N1, H1)
mY, pY = stft.from_audio(y, w1, N1, H1)

maxplotfreq = 10000.0
# highest bin to draw; must be an integer because it is used as a slice bound
# (NumPy rejects float slice bounds)
maxplotbin = int(N1 * maxplotfreq / fs)

plt.figure(1, figsize=(12, 9))

plt.subplot(311)
numFrames = int(mX.shape[0])
frmTime = H1 * np.arange(numFrames) / float(fs)
binFreq = np.arange(maxplotbin + 1) * float(fs) / N1
plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :maxplotbin + 1]))
plt.title('mX (orchestra.wav)')
plt.autoscale(tight=True)

plt.subplot(312)
numFrames = int(mX2.shape[0])