def expe_2():
    """Plot and export the waveform and spectrogram of two Arctic voice recordings.

    For each of the two hard-coded wav files, the signal is loaded (mono,
    normalized), drawn as a waveform above its spectrogram, saved as
    ``voice_<i>_spectro.pdf`` under ``figure_output_path`` and written back
    as ``voice_<i>_spectro.wav`` under ``audio_output_path``. Both output
    directories are module-level names. Returns nothing.
    """
    voice_paths = (
        '/sons/voxforge/main/Learn/cmu_us_slt_arctic/wav/arctic_a0372.wav',
        '/sons/voxforge/main/Learn/cmu_us_rms_arctic/wav/arctic_a0372.wav',
    )
    for voice_idx, voice_path in enumerate(voice_paths):
        voice = Signal(voice_path, normalize=True, mono=True)
        # Optional preprocessing that was left disabled:
        #voice.crop(0.1*voice.fs, 3.5*voice.fs)
        #voice.resample(32000)

        plt.figure(figsize=(10, 10))

        # upper panel: waveform against time in seconds, amplitude in [-1, 1]
        plt.subplot(211)
        time_axis = np.arange(.0, voice.length) / float(voice.fs)
        plt.plot(time_axis, voice.data)
        plt.xticks([])
        plt.ylim([-1, 1])
        plt.grid()

        # lower panel: spectrogram drawn by the Signal object itself
        plt.subplot(212)
        voice.spectrogram(1024, 64, order=0.5, log=False,
                          cmap=cm.hot, cbar=False)

        figure_file = op.join(figure_output_path,
                              'voice_%d_spectro.pdf' % voice_idx)
        plt.savefig(figure_file)
        audio_file = op.join(audio_output_path,
                             'voice_%d_spectro.wav' % voice_idx)
        voice.write(audio_file)
def recons_save_fig_audio(magspec, target_name, n_max_frames, fs=22050, format=(8, 3), nb_gl_iter=30):
    """Invert a magnitude spectrogram, then save the audio and its spectrogram figure.

    Parameters
    ----------
    magspec :
        Magnitude spectrogram; only its first ``n_max_frames`` columns are used.
    target_name : str
        Base name of the outputs: ``<target_name>.wav`` goes to
        ``output_audio_path`` and ``<target_name>.png`` to ``output_fig_path``
        (both are module-level names).
    n_max_frames : int
        Number of frames to reconstruct.
    fs :
        Passed to ``Signal`` together with the reconstructed samples.
    format : tuple
        Passed to ``plt.figure`` as ``figsize``.
    nb_gl_iter : int
        Iteration count handed to ``transforms.gl_recons``.

    Returns nothing.
    """
    # random starting point for the iterative reconstruction: 128 samples per
    # frame, 128 being the step passed to gl_recons below
    seed_signal = np.random.randn(128 * n_max_frames)
    waveform = transforms.gl_recons(
        magspec[:, :n_max_frames], seed_signal, nb_gl_iter, 512, 128,
        display=False)

    rec_sig = Signal(waveform, fs, normalize=True)
    wav_file = os.path.join(output_audio_path, '%s.wav' % target_name)
    rec_sig.write(wav_file)

    plt.figure(figsize=format)
    rec_sig.spectrogram(512, 128, order=1, log=True, cmap=cm.jet, cbar=False)
    png_file = os.path.join(output_fig_path, '%s.png' % target_name)
    plt.savefig(png_file)
def save_audio(learntype, np, test_file, n_feat, rescale_str, sigout, fs, norm_segments=False):
    """Concatenate the segments in ``sigout`` and write them as one wav file.

    The file name is
    ``<outputpath>/<basename of test_file>_with<learntype>_<n_feat>feats_<rescale_str><norm_str>.wav``
    where ``outputpath`` is a module-level name and ``norm_str`` is
    ``'normed'`` when ``norm_segments`` is true, empty otherwise.

    Parameters
    ----------
    learntype, rescale_str : str
        Labels inserted in the output file name.
    np :
        Ignored; kept so existing calls keep working. Because of its name it
        hides the module-level numpy alias inside this function, which is why
        numpy is imported locally under another name.
    test_file : str
        Path whose last component is used in the output file name.
    n_feat : int
        Number inserted in the output file name (``%d``).
    sigout : sequence of arrays
        Segments to concatenate, in order. Presumably 1-D sample arrays --
        TODO confirm against callers.
    fs :
        Passed to ``Signal`` together with the concatenated samples.
    norm_segments : bool
        When true, each segment is rescaled using its mean power and the
        mean power over all segments before concatenation.

    Raises
    ------
    ValueError
        If ``sigout`` is empty.

    The input segments are never modified; rescaling works on float copies.
    """
    import numpy as _numpy

    if len(sigout) == 0:
        raise ValueError('save_audio needs at least one segment in sigout')

    norm_str = ''
    segments = sigout
    if norm_segments:
        norm_str = 'normed'
        # float copies: integer segments cannot be divided in place
        segments = [_numpy.asarray(sig, dtype=float) for sig in sigout]
        # mean power per segment; an empty segment counts as silent
        powers = [_numpy.sum(sig ** 2) / float(len(sig)) if len(sig) else 0.0
                  for sig in segments]
        mean_energy = _numpy.mean(powers)
        # NOTE(review): the amplitude is divided by the segment power (not by
        # its square root), so segments do not end up with equal energy.
        # Formula kept as it was -- confirm whether sqrt scaling was intended.
        # Silent segments are left untouched instead of producing NaN.
        segments = [sig / power * mean_energy if power > 0 else sig
                    for sig, power in zip(segments, powers)]

    rec_sig = Signal(_numpy.concatenate(segments), fs, normalize=True)
    rec_sig.write('%s/%s_with%s_%dfeats_%s%s.wav' %
                  (outputpath, os.path.split(test_file)[-1], learntype,
                   n_feat, rescale_str, norm_str))
def save_audio_full_ref(learntype, test_file, n_feat, rescale_str, sigout, fs, norm_segments=False):
    """Mix uncut segments at their start indices and write the result as a wav file.

    Every ``(signal, start_index)`` pair of ``sigout`` is added into a zero
    buffer at ``int(start_index)``, so overlapping segments sum. The file
    name is
    ``<outputpath>/<basename of test_file>_with<learntype>_<n_feat>feats_<rescale_str>full_ref.wav``
    where ``outputpath`` is a module-level name.

    Parameters
    ----------
    learntype, rescale_str : str
        Labels inserted in the output file name.
    test_file : str
        Path whose last component is used in the output file name.
    n_feat : int
        Number inserted in the output file name (``%d``).
    sigout : sequence of (signal, start_index) pairs
        Segments and the sample index at which each one starts.
    fs :
        Passed to ``Signal``; also sizes the ``4 * fs`` samples of padding
        appended after the last sample.
    norm_segments : bool
        Accepted but not used by this function.

    Raises
    ------
    IndexError
        If ``sigout`` is empty.
    """
    if len(sigout) == 0:
        raise IndexError(
            'save_audio_full_ref needs at least one (signal, start) pair')

    # first pass for total length: take the furthest end over ALL segments,
    # so a long early segment can never overrun the buffer
    last_sample = max(int(startidx) + len(sig) for (sig, startidx) in sigout)
    # int() keeps the length usable as an array shape when fs is a float
    max_idx = last_sample + int(4 * fs)
    # same output as the Python 2 statement, valid on Python 3 as well
    print("total length of  %s" % max_idx)

    sig_data = np.zeros((max_idx, ))
    for (sig, startidx) in sigout:
        begin = int(startidx)
        sig_data[begin:begin + len(sig)] += sig

    rec_sig = Signal(sig_data, fs, normalize=True)
    rec_sig.write('%s/%s_with%s_%dfeats_%s%s.wav' %
                  (outputpath, os.path.split(test_file)[-1], learntype,
                   n_feat, rescale_str, 'full_ref'))
if dist_idx[-1] * ratio > distorted_spec.shape[0]: break new_spec = np.abs(learn_specs[[int(j * ratio) for j in dist_idx], :]) distorted_spec[dist_idx, :] = new_spec plt.figure() plt.imshow(np.log(distorted_spec.T), origin='lower') plt.show() # resynthesize init_vec = np.random.randn(distorted_spec.shape[0] * tstep) rec_method2 = transforms.gl_recons(distorted_spec.T, init_vec, 10, wsize, tstep, display=False) rec_sig_2 = Signal(rec_method2, original.fs, mono=True, normalize=True) rec_sig_2.write('/sons/tests/rec_sig2.wav') # let us see if the spectrum joined together look alike #seg_idx = 10 #plt.figure() #plt.subplot(121) #plt.imshow(np.log(np.abs(learn_specs[spec_idx[seg_idx],:]).T)) #plt.subplot(122) #plt.plot(np.median(np.log(np.abs(learn_specs[spec_idx[seg_idx],:]).T), axis=1)) #plt.plot(np.log(np.mean(np.abs(learn_specs[spec_idx[seg_idx],:].T), axis=1)),'k') #plt.plot(np.mean(np.log(np.abs(learn_specs[spec_idx[seg_idx],:]).T), axis=1),'r') #plt.show()
# Reference signals for the learning excerpt and for the test excerpt, the
# latter being `test_sample` samples of `Datas` starting at `start_t_sample`.
sig_learn_ref = Signal(ref_learn_data, sr)
ref_test_data = Datas[start_t_sample:start_t_sample + test_sample]
sig_test_ref = Signal(ref_test_data, sr)

# Settings handed to regression.eval_knn.
nb_median, nb_iter_gl, l_medfilt = 5, 20, 1
params = {
    'win_size': int(wintime * sr),    # window length in samples
    'step_size': int(steptime * sr),  # step length in samples
}

res_array = regression.eval_knn(
    learn_feats, learn_magspecs,
    test_feats, test_magspecs, ref_test_data,
    nb_median, nb_iter_gl, l_medfilt, params)

# Write the second entry of the result as audio, next to the test reference.
output_path = '/home/manu/workspace/audio-sketch/src/results/'
res_sig = Signal(res_array[1], sr, mono=True, normalize=True)
res_sig.write(output_path + (
    'audio/test_rwc-g-m01_4_learn_%s%s_%dmedian.wav' %
    (add_sample_str, add_col_str, nb_median)))
sig_test_ref.write(output_path + (
    'audio/ref_rwc-g-m04_4_learn_%d.wav' % int(100 * learn_ratio)))

#plt.figure()
#plt.plot(res_array[2])
#plt.show()

## terrible idea: use the waveforms directly?
#Xdev = learn_feats
#Ydev = learn_feats
#estimated_windowed_wf = regression.ann(Xdev, Ydev, X, Y, display=False, K=1)
def expe_1_synth_from_same_sample():
    """Resynthesize an excerpt of a track from the track's own earlier segments.

    Steps, all on the hard-coded ``rwc-g-m01_1`` wav/h5 pair:

    1. read the per-segment features (timbre, loudness at segment start,
       pitches) and the segment start times from the HDF5 file;
    2. take the first ``learn_duration`` seconds as learning data and compute
       its STFT;
    3. map every learning segment to its range of STFT frames;
    4. find, for each test segment, the nearest learning segment in feature
       space;
    5. build a spectrogram from the matched segments with ``spec_morph``,
       plot it next to the true STFT of the test excerpt, invert it with
       ``transforms.gl_recons`` and write ``/sons/tests/rec_sig2.wav``.

    Takes no argument and returns nothing; ``plt.show()`` is called on the
    comparison figure before the audio is written.
    """
    input_dir = '/sons/rwc/Learn/'
    output_dir = '/sons/rwc/Learn/hdf5/'
    audiofile = input_dir + 'rwc-g-m01_1.wav'
    h5file = output_dir + 'rwc-g-m01_1.h5'

    # load the Echo Nest features; the handle is closed even if a getter fails
    h5 = hdf5_getters.open_h5_file_read(h5file)
    try:
        timbre = hdf5_getters.get_segments_timbre(h5)
        loudness_start = hdf5_getters.get_segments_loudness_start(h5)
        C = hdf5_getters.get_segments_pitches(h5)
        segments_all = hdf5_getters.get_segments_start(h5)
    finally:
        h5.close()

    # one row per segment: timbre columns, one loudness column, pitch columns
    learn_feats_all = np.hstack(
        (timbre,
         loudness_start.reshape((loudness_start.shape[0], 1)),
         C))

    # Find the nearest neighbors, get the segments back and resynthesize.
    learn_duration = 200  # in seconds
    test_start = 200
    test_duration = 5

    # Get learning data
    learning = Signal(audiofile, mono=True)
    learning.crop(0, learn_duration * learning.fs)

    wsize = 1024
    tstep = 512
    # Get the magnitude spectrum for the given audio file (frames as rows)
    learn_specs = features.get_stft(learning.data, wsize, tstep)
    learn_specs = learn_specs.T

    max_l_seg_idx = np.where(segments_all < learn_duration)[0][-1]
    l_segments = segments_all[:max_l_seg_idx]
    learn_feats = learn_feats_all[:max_l_seg_idx, :]

    # For each segment index, keep the corresponding frame indices in
    # learn_specs: the first frame after the segment start up to the first
    # frame after the next segment start.
    l_seg_bounds = []
    ref_time = np.arange(0.,
                         float(learning.length) / float(learning.fs),
                         float(tstep) / float(learning.fs))
    for segI in range(len(l_segments) - 1):
        startIdx = np.where(ref_time > l_segments[segI])[0][0]
        endIdx = np.where(ref_time > l_segments[segI + 1])[0][0]
        l_seg_bounds.append((startIdx, endIdx))
    # the last segment runs until the end of the learning excerpt
    l_seg_bounds.append((endIdx, ref_time.shape[0]))

    # Get testing data (both crop bounds use the test signal's own rate)
    testing = Signal(audiofile, mono=True)
    testing.crop(test_start * testing.fs,
                 (test_start + test_duration) * testing.fs)

    # get the testing features
    min_t_seg_idx = np.where(segments_all < test_start)[0][-1]
    max_t_seg_idx = np.where(segments_all < test_start + test_duration)[0][-1]
    t_segments = segments_all[min_t_seg_idx:max_t_seg_idx]
    t_segment_lengths = (t_segments[1:] - t_segments[0:-1]) * testing.fs
    test_feats = learn_feats_all[min_t_seg_idx:max_t_seg_idx, :]

    # find the nearest neighbors
    from sklearn.neighbors import NearestNeighbors
    # n_neighbors must be given by keyword in current scikit-learn releases
    neigh = NearestNeighbors(n_neighbors=1)
    # fit on the learning data
    neigh.fit(learn_feats)
    neighb_segments_idx = neigh.kneighbors(test_feats, return_distance=False)

    # The neighbors are segment indices: get the spectrogram back from the
    # learning data, then fit the new segment lengths.
    target_length = int(test_duration * testing.fs)
    # list(): keeps a real sequence on Python 3, where zip is lazy
    neighb_segments = list(zip(neighb_segments_idx[:, 0],
                               t_segment_lengths.astype(int)))
    morphed_spectro = spec_morph(np.abs(learn_specs), target_length,
                                 neighb_segments, l_seg_bounds)

    # retrieve true stft for comparison
    test_specs = features.get_stft(testing.data, wsize, tstep)

    plt.figure()
    plt.subplot(121)
    plt.imshow(np.log(np.abs(test_specs)), origin='lower')
    plt.colorbar()
    plt.subplot(122)
    plt.imshow(np.log(morphed_spectro.T), origin='lower')
    plt.colorbar()
    plt.show()

    # invert the morphed spectrogram, starting from a random signal
    init_vec = np.random.randn(morphed_spectro.shape[0] * tstep)
    rec_method2 = transforms.gl_recons(morphed_spectro.T, init_vec, 10,
                                       wsize, tstep, display=False)
    rec_sig_2 = Signal(rec_method2, testing.fs, mono=True, normalize=True)
    rec_sig_2.write('/sons/tests/rec_sig2.wav')
print "Loading ", filepath signalin, fs = get_audio(filepath, ref_audio_start, ref_audio_duration) target_length = target_audio_duration*fs print "Loaded %s length of %d "%( filepath, len(signalin)) print "Stretching to %2.2f"%target_length # adjust the Loudness ? if rescale: rescale_str = 'normed' signalin = signalin.astype(float) signalin /= 8192.0 signalin /= np.max(signalin) # N = float(len(signalin)) # target_loudness = test_feats[test_seg_idx, 13] # adjust = target_loudness - 10*np.log10((1.0/N)*np.sum(signalin**2)) # signalin *= 10**(adjust/10.) signalin *= 8192.0 signalin = signalin.astype(np.int16) sigout[num_neigh].append(time_stretch(signalin, tscale, wsize=1024, tstep=128)[128:-1024]) for num_neigh in range(n_neighbs): rec_sig = Signal(np.concatenate(sigout[num_neigh]), fs, normalize=True) rec_sig.write('/home/manu/workspace/audio-sketch/src/results/audio/%s_with%s_%dfeats_%s_neighbor_%d.wav'%( os.path.split(test_file)[-1], learntype, n_feat, rescale_str, num_neigh))
output_audio_path = '/home/manu/Documents/Articles/ISMIR2013/ListeningMSD/Audio/' output_fig_path = '/home/manu/Documents/Articles/ISMIR2013/ListeningMSD/Figures/' colormap = cm.jet format = (8,3) # also load the Dan Ellis's synthesized version # The Piano cross-synthesis and the Viterbi smoothed Musaicing? # resynthesize using the first N frames n_max_frames = 900 nb_gl_iter = 30 init_vec = np.random.randn(128*n_max_frames) x_recon_median = transforms.gl_recons(median_magspec[:,:n_max_frames], init_vec, nb_gl_iter, 512, 128, display=False) sig_median = Signal(x_recon_median, 22050,normalize=True) sig_median.write(os.path.join(output_audio_path, '%s_add_median.wav'%t_name)) plt.figure(figsize=format) sig_median.spectrogram(512, 128, order=1, log=True, cmap=colormap, cbar=False) plt.savefig(os.path.join(output_fig_path, '%s_add_median.png'%t_name)) init_vec = np.random.randn(128*n_max_frames) x_recon_orig = transforms.gl_recons(orig_spec[:,:n_max_frames], init_vec, nb_gl_iter, 512, 128, display=False) sig_orig= Signal(x_recon_orig, 22050,normalize=True) sig_orig.write(os.path.join(output_audio_path, '%s_original.wav'%t_name)) plt.figure(figsize=format) sig_orig.spectrogram(512, 128, order=1, log=True, cmap=colormap, cbar=False) plt.savefig(os.path.join(output_fig_path, '%s_original.png'%t_name)) init_vec = np.random.randn(128*n_max_frames) x_recon_max = transforms.gl_recons(max_magspec[:,:n_max_frames], init_vec, nb_gl_iter,
sr = 16000
# Window and step lengths in samples. They are used as sample counts below
# (np.random.randn requires an integer), so make them integers explicitly.
win_size = int(round(steptime * 2 * sr))
step_size = int(round(steptime * sr))

# sliding median filtering ?
# import from the public scipy.ndimage namespace; the `filters` submodule is
# deprecated
from scipy.ndimage import median_filter
estimated_spectrum_filt = median_filter(estimated_spectrum, (1, 20))

# the filtered spectrum is only displayed; the reconstruction below uses the
# unfiltered one
plt.figure()
plt.imshow(np.log(estimated_spectrum_filt), origin='lower')
plt.show()

# reconstruction
#init_vec = np.random.randn(step_size*Y_hat.shape[1])
init_vec = np.random.randn(step_size * estimated_spectrum.shape[1])
#x_recon = transforms.gl_recons(estimated_spectrum_filt, init_vec, 20,
#                               win_size, step_size, display=False)
x_recon = transforms.gl_recons_vary_size(estimated_spectrum,
                                         n_segments_start, 20,
                                         win_size, step_size, display=False)

output_path = '/home/manu/workspace/audio-sketch/src/results/'
res_sig = Signal(x_recon, sr, mono=True, normalize=True)
res_sig.write(output_path + 'audio/resynth_%s_%dmedian.wav' % (title, nb_median))
#sig_test_ref.write(output_path+'audio/resynth_%s_learn_%d.wav'%int(100*learn_ratio))
#    sim_mat[t,:] = np.sum((t_feats - t_feats[t,:])**2, axis=1)
#
#plt.figure()
#plt.imshow(sim_mat, origin='lower')
#plt.colorbar()
#plt.show()

# Viterbi decoding of the candidate sequence
from tools.learning_tools import Viterbi
vit_path = Viterbi(neigh, distance, trans_penalty=0.01, c_value=20)
# for every row of `neigh`, keep the candidate picked by the decoded path
vit_cands = [neigh[row, picked] for row, picked in enumerate(vit_path)]

# resynthesize the decoded candidates and write them to disk
sig_out_viterbi = resynth_sequence(
    np.squeeze(vit_cands), t_seg_starts, t_seg_duration,
    l_segments, l_feats, ref_audio_dir, '.au', 22050,
    dotime_stretch=True, max_synth_idx=40, normalize=True)
sig_viterbi = Signal(sig_out_viterbi, 22050, normalize=True)
sig_viterbi.write(
    '%s/%s_viterbi_%dFeats_%dLearns_Filter%d.wav' %
    (outputpath, h5files[t_index - 1], nbFeats, n_learn, filter_key))
# the crop happens after the write, so it only affects the in-memory signal
sig_viterbi.crop(0, 9.5 * sig_viterbi.fs)
#
#sig_viterbi = save_audio(outputpath, '%s_viterbi'%h5files[t_index], sig_out_viterbi, 22050, norm_segments=False)
params['forbidden_names'] = [os.path.basename(i) for i in learn_files] [test_feats_all, test_magspecs, n_f_test, ref_t_data, test_files] = load_yaafedata(params) # search for any test file that is already in the learning set isinbase = any([ os.path.basename(p) in params['forbidden_names'] for p in test_files ]) save_test_name = 'test_audio_seed_%d_%d_trial%s' % ( learn_seed, params['shuffle'], trialIdx) # also save the audio res_sig = Signal(ref_t_data, params['sr'], mono=True, normalize=True) res_sig.write(output_path + 'audio/' + save_test_name + '.wav') for nli in range(len(nb_learns)): nb_learn = nb_learns[nli] for mfi in range(len(nb_features)): nb_feat = nb_features[mfi] learn_feats = learn_feats_all[0:nb_learn, 0:nb_feat] learn_magspecs = learn_magspecs_all[:, 0:nb_learn] test_feats = test_feats_all[:, 0:nb_feat] for nmi in range(len(nb_medians)): nb_median = nb_medians[nmi] # Getting the spectrum with all features considered
# Extend the import path with two local checkouts before importing `stft`
# (presumably `stft` lives in one of them -- TODO confirm).
sys.path.append('/home/manu/workspace/PyMP')
sys.path.append('/home/manu/workspace/meeg_denoise')
import stft

# load the sinewave speech, keep the magnitude of its STFT and reconstruct a
# waveform from it, starting from a random signal of the same length
sinewave = Signal('/sons/sqam/vegaSWS.wav', mono=True)
spectro = stft.stft(sinewave.data, wsize=1024, tstep=256)[0, :, :]
init_vec = np.random.randn(sinewave.data.shape[0])
rec_gl_data = transforms.gl_recons(np.abs(spectro), init_vec,
                                   niter=20, wsize=1024, tstep=256)
sig_rec = Signal(rec_gl_data, sinewave.fs, mono=True, normalize=True)
sig_rec.write('/sons/sqam/vegaSWS_gl.wav')
# ok it's working just fine

# now compare with reconstruction from original spectrogram
original = Signal('/sons/sqam/vega.wav', mono=True)
spectro = stft.stft(original.data, wsize=1024, tstep=256)[0, :, :]
init_vec = np.random.randn(original.data.shape[0])
rec_gl_data = transforms.gl_recons(np.abs(spectro), init_vec,
                                   niter=20, wsize=1024, tstep=256)
# use the sampling rate of the signal being reconstructed, not the rate of
# the sinewave file loaded earlier
sig_rec = Signal(rec_gl_data, original.fs, mono=True, normalize=True)
sig_rec.write('/sons/sqam/vega_gl.wav')