def test_reconstruct_sound():
    # Regression test: harmonic analysis + sinusoidal resynthesis of a short sax phrase.
    fs, x = audio.read_wav(sound_path("sax-phrase-short.wav"))
    window_size, fft_size, hop_size = 4001, 4096, 2048
    window = get_window('hamming', window_size)

    xtfreq, xtmag, xtphase = harmonic.from_audio(
        x, fs, window, fft_size, hop_size,
        t=-80, nH=20, minf0=100, maxf0=2000, f0et=5,
        harmDevSlope=0.01, minSineDur=.02)
    x_reconstructed = sine.to_audio(xtfreq, xtmag, xtphase, fft_size, hop_size, fs)

    assert 138746 == len(x)

    # every model output has one frame per hop
    expected_frame_count = int(math.ceil(float(len(x)) / hop_size))
    for model_output in (xtfreq, xtmag, xtphase):
        assert expected_frame_count == len(model_output)
    assert xtfreq.shape[1] <= 100

    # statistics of the model for regression testing without explicitly storing the whole data
    assert np.allclose(1738.618043903208, xtfreq.mean())
    assert np.allclose(-64.939768348945279, xtmag.mean())
    assert np.allclose(1.6687005886001871, xtphase.mean())

    # TODO: this is completely off, it should be equal to len(x)!
    assert 69 * 2048 == len(x_reconstructed)
    assert np.allclose(0.036941947007791701, rmse(x, x_reconstructed[:len(x)]))
def test_reconstruct_sound():
    # Regression test: sine-model analysis + resynthesis of a short sax phrase.
    fs, x = audio.read_wav(sound_path("sax-phrase-short.wav"))
    window_size, fft_size, hop_size = 4001, 4096, 2048
    window = get_window('hamming', window_size)

    xtfreq, xtmag, xtphase = sine.from_audio(
        x, fs, window, fft_size, hop_size,
        t=-80, maxnSines=100, minSineDur=.01,
        freqDevOffset=20, freqDevSlope=0.01)
    x_reconstructed = sine.to_audio(xtfreq, xtmag, xtphase, fft_size, hop_size, fs)

    assert 138746 == len(x)

    # every model output has one frame per hop
    expected_frame_count = int(math.ceil(float(len(x)) / hop_size))
    for model_output in (xtfreq, xtmag, xtphase):
        assert expected_frame_count == len(model_output)
    assert xtfreq.shape[1] <= 100

    # statistics of the model for regression testing without explicitly storing the whole data
    assert np.allclose(945.892990545, xtfreq.mean())
    assert np.allclose(-30.3138495002, xtmag.mean())
    assert np.allclose(1.34449391701, xtphase.mean())

    # TODO: this is completely off, it should be equal to len(x)!
    assert 69 * 2048 == len(x_reconstructed)
    assert np.allclose(0.010812475879315771, rmse(x, x_reconstructed[:len(x)]))
def analysis(inputFile=demo_sound_path('mridangam.wav'), window='hamming', M=801, N=2048, t=-90,
             minSineDur=0.01, maxnSines=150, freqDevOffset=20, freqDevSlope=0.02,
             interactive=True, plotFile=False):
    """
    Analyze a sound with the sine model
    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    returns inputFile: input file name; fs: sampling rate of input file, tfreq, tmag: sinusoidal frequencies and magnitudes
    """
    Ns = 512  # size of fft used in synthesis
    H = 128   # hop size (has to be 1/4 of Ns)

    # load the input sound and build the analysis window
    fs, x = audio.read_wav(inputFile)
    w = get_window(window, M)

    # sinusoidal model of the whole sound, then resynthesis without the original phases
    tfreq, tmag, tphase = sine.from_audio(x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    y = sine.to_audio(tfreq, tmag, np.array([]), Ns, H, fs)

    # write the synthesized sound (monophonic with sampling rate of 44100)
    outputFile = 'output_sounds/' + strip_file(inputFile) + '_sineModel.wav'
    audio.write_wav(y, fs, outputFile)

    plt.figure(figsize=(12, 9))
    maxplotfreq = 5000.0  # upper frequency limit shown in the track plot

    # input waveform
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # sinusoidal track frequencies (mask a copy so tfreq is returned untouched)
    if (tfreq.shape[1] > 0):
        plt.subplot(3, 1, 2)
        tracks = np.copy(tfreq)
        tracks = tracks * np.less(tracks, maxplotfreq)
        tracks[tracks <= 0] = np.nan
        numFrames = int(tracks.shape[0])
        frmTime = H * np.arange(numFrames) / float(fs)
        plt.plot(frmTime, tracks)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of sinusoidal tracks')

    # output waveform
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    if interactive:
        plt.show(block=False)
    if plotFile:
        plt.savefig('output_plots/%s_sine_transformation_analysis.png' % files.strip_file(inputFile))

    return inputFile, fs, tfreq, tmag
def transformation_synthesis(inputFile, fs, tfreq, tmag,
                             freqScaling=np.array([0, 2.0, 1, .3]),
                             timeScaling=np.array([0, .0, .671, .671, 1.978, 1.978 + 1.0]),
                             interactive=True, plotFile=False):
    """
    Transform the analysis values returned by the analysis function and synthesize the sound
    inputFile: name of input file; fs: sampling rate of input file
    tfreq, tmag: sinusoidal frequencies and magnitudes
    freqScaling: frequency scaling factors, in time-value pairs
    timeScaling: time scaling factors, in time-value pairs
    """
    Ns = 512  # size of fft used in synthesis
    H = 128   # hop size (has to be 1/4 of Ns)

    # apply the frequency scaling, then the time scaling, to the sinusoidal tracks
    ytfreq = sine.scale_frequencies(tfreq, freqScaling)
    ytfreq, ytmag = sine.scale_time(ytfreq, tmag, timeScaling)

    # synthesize and save the transformed sound
    y = sine.to_audio(ytfreq, ytmag, np.array([]), Ns, H, fs)
    outputFile = 'output_sounds/' + strip_file(inputFile) + '_sineModelTransformation.wav'
    audio.write_wav(y, fs, outputFile)

    plt.figure(figsize=(12, 6))
    maxplotfreq = 15000.0  # upper frequency limit shown in the track plot

    # transformed sinusoidal track frequencies (mask a copy, leave ytfreq intact)
    if (ytfreq.shape[1] > 0):
        plt.subplot(2, 1, 1)
        tracks = np.copy(ytfreq)
        tracks = tracks * np.less(tracks, maxplotfreq)
        tracks[tracks <= 0] = np.nan
        numFrames = int(tracks.shape[0])
        frmTime = H * np.arange(numFrames) / float(fs)
        plt.plot(frmTime, tracks)
        plt.title('transformed sinusoidal tracks')
        plt.autoscale(tight=True)

    # output waveform
    plt.subplot(2, 1, 2)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    if interactive:
        plt.show()
    if plotFile:
        plt.savefig('output_plots/%s_sine_transformation_synthesis.png' % files.strip_file(inputFile))
def main(inputFile=demo_sound_path('bendir.wav'), window='hamming', M=2001, N=2048, t=-80,
         minSineDur=0.02, maxnSines=150, freqDevOffset=10, freqDevSlope=0.001,
         interactive=True, plotFile=False):
    """
    Perform analysis/synthesis using the sinusoidal model
    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    """
    # size of fft used in synthesis
    Ns = 512
    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound and compute the analysis window
    fs, x = audio.read_wav(inputFile)
    w = get_window(window, M)

    # analyze the sound with the sinusoidal model
    tfreq, tmag, tphase = sine.from_audio(x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)

    # synthesize the output sound from the sinusoidal representation
    y = sine.to_audio(tfreq, tmag, tphase, Ns, H, fs)

    # write the synthesized sound obtained from the sinusoidal synthesis
    outputFile = 'output_sounds/' + strip_file(inputFile) + '_sineModel.wav'
    audio.write_wav(y, fs, outputFile)

    # create figure to show plots
    plt.figure(figsize=(12, 9))
    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the sinusoidal frequencies
    plt.subplot(3, 1, 2)
    if (tfreq.shape[1] > 0):
        numFrames = tfreq.shape[0]
        frmTime = H * np.arange(numFrames) / float(fs)
        # mask zeros on a copy: the original code set NaNs into tfreq itself,
        # mutating the model data during plotting (analysis() above copies first)
        tracks = np.copy(tfreq)
        tracks[tracks <= 0] = np.nan
        plt.plot(frmTime, tracks)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of sinusoidal tracks')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    if interactive:
        plt.show()
    if plotFile:
        plt.savefig('output_plots/%s_sine_model.png' % files.strip_file(inputFile))
# Harmonic-model analysis/resynthesis demo on vignesh.wav.
(fs, x) = audio.read_wav('../../../sounds/vignesh.wav')
w = np.blackman(1201)   # analysis window
N = 2048                # FFT size
t = -90                 # peak magnitude threshold (dB)
nH = 100                # maximum number of harmonics
minf0 = 130             # minimum fundamental frequency (Hz)
maxf0 = 300             # maximum fundamental frequency (Hz)
f0et = 7                # maximum error accepted in f0 detection
Ns = 512                # synthesis FFT size
H = Ns // 4             # hop size must be an int; "/" would give a float on Python 3
minSineDur = .1
harmDevSlope = 0.01

hfreq, hmag, hphase = harmonic.from_audio(x, fs, w, N, H, t, nH, minf0, maxf0, f0et, harmDevSlope, minSineDur)
y = sine.to_audio(hfreq, hmag, hphase, Ns, H, fs)
numFrames = int(hfreq.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)

plt.figure(1, figsize=(9, 7))
plt.subplot(3, 1, 1)
plt.plot(np.arange(x.size) / float(fs), x, 'b')
plt.axis([0, x.size / float(fs), min(x), max(x)])
plt.title('x (vignesh.wav)')

plt.subplot(3, 1, 2)
# np.copy, not a bare assignment: "yhfreq = hfreq" only aliased the array,
# so writing NaNs into it clobbered hfreq as well
yhfreq = np.copy(hfreq)
yhfreq[yhfreq == 0] = np.nan
plt.plot(frmTime, yhfreq, lw=1.2)
from smst.models import sine

# Sinusoidal-model demo on the first 50000 samples of bendir.wav.
(fs, x) = audio.read_wav('../../../sounds/bendir.wav')
x1 = x[0:50000]
w = np.blackman(2001)   # analysis window
N = 2048                # FFT size
t = -90                 # peak magnitude threshold (dB)
minSineDur = .01
maxnSines = 150
freqDevOffset = 20
freqDevSlope = 0.02
Ns = 512                # synthesis FFT size
# hop size (has to be 1/4 of Ns); integer division keeps it an int on Python 3.
# The original also assigned H = 500 first, which was immediately overwritten.
H = Ns // 4

tfreq, tmag, tphase = sine.from_audio(x1, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
y = sine.to_audio(tfreq, tmag, tphase, Ns, H, fs)
numFrames = int(tfreq.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)
maxplotfreq = 3000.0

plt.figure(1, figsize=(9, 7))
plt.subplot(3, 1, 1)
plt.plot(np.arange(x1.size) / float(fs), x1, 'b', lw=1.5)
plt.axis([0, x1.size / float(fs), min(x1), max(x1)])
plt.title('x (bendir.wav)')

plt.subplot(3, 1, 2)
# keep only tracks below maxplotfreq; NaNs make matplotlib skip the gaps
tracks = tfreq * np.less(tfreq, maxplotfreq)
tracks[tracks <= 0] = np.nan
# Time-scaling demo with the sine model; assumes fs and x (mridangam.wav)
# were loaded earlier in the script.
w = np.hamming(801)     # analysis window
N = 2048                # FFT size
t = -90                 # peak magnitude threshold (dB)
minSineDur = .005
maxnSines = 150
freqDevOffset = 20
freqDevSlope = 0.02
Ns = 512                # synthesis FFT size
H = Ns // 4             # hop size must be an int; "/" would give a float on Python 3

mX, pX = stft.from_audio(x, w, N, H)
tfreq, tmag, tphase = sine.from_audio(x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)

# time-scaling envelope as (input time, output time) pairs
timeScale = np.array(
    [.01, .0, .03, .03, .335, .4, .355, .42, .671, .8, .691, .82,
     .858, 1.2, .878, 1.22, 1.185, 1.6, 1.205, 1.62, 1.497, 2.0,
     1.517, 2.02, 1.686, 2.4, 1.706, 2.42, 1.978, 2.8])
ytfreq, ytmag = sine.scale_time(tfreq, tmag, timeScale)

# resynthesize without the original phases and analyze the result
y = sine.to_audio(ytfreq, ytmag, np.array([]), Ns, H, fs)
mY, pY = stft.from_audio(y, w, N, H)

plt.figure(1, figsize=(12, 9))
maxplotfreq = 4000.0

plt.subplot(4, 1, 1)
plt.plot(np.arange(x.size) / float(fs), x, 'b')
plt.axis([0, x.size / float(fs), min(x), max(x)])
plt.title('x (mridangam.wav)')

plt.subplot(4, 1, 2)
numFrames = int(tfreq.shape[0])
frmTime = H * np.arange(numFrames) / float(fs)
# keep only tracks below maxplotfreq; NaNs make matplotlib skip the gaps
tracks = tfreq * np.less(tfreq, maxplotfreq)
tracks[tracks <= 0] = np.nan
plt.plot(frmTime, tracks, color='k', lw=1)
def main(inputFile=demo_sound_path('vignesh.wav'), window='blackman', M=1201, N=2048, t=-90,
         minSineDur=0.1, nH=100, minf0=130, maxf0=300, f0et=7, harmDevSlope=0.01,
         interactive=True, plotFile=False):
    """
    Analysis and synthesis using the harmonic model
    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    nH: maximum number of harmonics; minf0: minimum fundamental frequency in sound
    maxf0: maximum fundamental frequency in sound; f0et: maximum error accepted in f0 detection algorithm
    harmDevSlope: allowed deviation of harmonic tracks, higher harmonics could have higher allowed deviation
    """
    # size of fft used in synthesis
    Ns = 512
    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound and compute the analysis window
    (fs, x) = audio.read_wav(inputFile)
    w = get_window(window, M)

    # detect harmonics of input sound
    hfreq, hmag, hphase = harmonic.from_audio(x, fs, w, N, H, t, nH, minf0, maxf0, f0et, harmDevSlope, minSineDur)

    # synthesize the harmonics
    y = sine.to_audio(hfreq, hmag, hphase, Ns, H, fs)

    # write the sound resulting from harmonic analysis (monophonic, 44100 Hz)
    outputFile = 'output_sounds/' + files.strip_file(inputFile) + '_harmonicModel.wav'
    audio.write_wav(y, fs, outputFile)

    # create figure to show plots
    plt.figure(figsize=(12, 9))
    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the harmonic frequencies
    plt.subplot(3, 1, 2)
    if (hfreq.shape[1] > 0):
        numFrames = hfreq.shape[0]
        frmTime = H * np.arange(numFrames) / float(fs)
        # mask zeros on a copy: the original code set NaNs into hfreq itself,
        # mutating the model data during plotting
        tracks = np.copy(hfreq)
        tracks[tracks <= 0] = np.nan
        plt.plot(frmTime, tracks)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of harmonic tracks')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    if interactive:
        plt.show()
    if plotFile:
        plt.savefig('output_plots/%s_harmonic_model.png' % files.strip_file(inputFile))