def convergence_callback(Y):
    """Evaluate the current separation estimate and log its SDR and SIR.

    ``Y`` is indexed as ``Y[:, :, ch]``, i.e. with the channel on the last
    axis. Each channel is brought back to the time domain with ``pra.istft``
    and compared against the module-level ``separate_recordings`` using
    ``bss_eval_sources``. The resulting SDR and SIR values are appended to the
    module-level lists ``SDR`` and ``SIR``; nothing is returned.
    """
    global SDR, SIR
    from mir_eval.separation import bss_eval_sources

    reference = np.moveaxis(separate_recordings, 1, 2)
    half = L // 2

    # Channel-wise inverse STFT of the current estimate.
    time_channels = []
    for ch in range(Y.shape[2]):
        time_channels.append(
            pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft,
                      zp_front=half, zp_back=half))
    estimate = np.array(time_channels)

    # The estimate is shifted by half a frame relative to the reference,
    # so both are cropped to the overlapping region before evaluation.
    ref_part = reference[:, :estimate.shape[1] - half, 0]
    est_part = estimate[:, half:reference.shape[1] + half]

    sdr, sir, _sar, _perm = bss_eval_sources(ref_part, est_part)
    SDR.append(sdr)
    SIR.append(sir)
def test_stft_nowindow(self):
    """Check that ``pra.stft`` followed by ``pra.istft`` (no window) recovers
    random input, for several FFT sizes and hop divisors."""
    n_frames = 100
    n_trials = 10

    for nfft in (128, 256, 512):
        for div in (1, 2):
            hop = nfft // div
            n_samples = n_frames * nfft // div + nfft - hop

            for _ in range(n_trials):
                x = np.random.randn(n_samples)

                spectrum = pra.stft(x, nfft, hop, transform=np.fft.rfft)
                y = pra.istft(spectrum, nfft, hop, transform=np.fft.irfft)

                # because of overlap, there is a scaling at reconstruction
                # everywhere except in the first and last hop
                y[hop:-nfft // div] /= div

                self.assertTrue(np.allclose(x, y))
zp_front=L // 2, zp_back=L // 2) for ch in mics_signals ]) X = np.moveaxis(X, 0, 2) # Run AuxIVA Y = pra.bss.auxiva(X, n_iter=30, proj_back=True, callback=convergence_callback) # run iSTFT y = np.array([ pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_front=L // 2, zp_back=L // 2) for ch in range(Y.shape[2]) ]) # Compare SIR ############# sdr, sir, sar, perm = bss_eval_sources(ref[:, :y.shape[1] - L // 2, 0], y[:, L // 2:ref.shape[1] + L // 2]) print('SDR:', sdr) print('SIR:', sir) import matplotlib.pyplot as plt plt.figure() plt.subplot(2, 2, 1)
def test_sparseauxiva():
    """Run SparseAuxIVA on a simulated two-microphone mixture and dump WAVs.

    One signal is built per entry of the module-level ``wav_files``. Each
    source is simulated alone in a ``pra.ShoeBox`` room, the recordings are
    summed into a mixture, the mixture is separated with
    ``pra.bss.sparseauxiva`` and the SDR/SIR returned by ``bss_eval_images``
    are printed. Source, mixture and separated signals are written as WAV
    files (raw and peak-normalized) to the current working directory.
    """
    fs = 16000

    def to_int16(samples):
        # Plain cast to 16-bit integers, no rescaling.
        return np.asarray(samples, dtype=np.int16)

    def to_int16_normalized(samples):
        # Scale so that the largest magnitude maps to 32767, then cast.
        return np.asarray(samples / np.max(np.abs(samples)) * 32767,
                          dtype=np.int16)

    # One float32 signal per source: the concatenation of its files.
    signals = []
    for source_files in wav_files:
        clips = [
            wavfile.read(f)[1].astype(np.float32, order='C')
            for f in source_files
        ]
        signals.append(np.concatenate(clips))

    for idx, name in enumerate(('sample1.wav', 'sample2.wav')):
        wavfile.write(name, fs, to_int16(signals[idx]))

    # Room setup (room_dim = [8, 9], image sources up to order 15).
    room_dim = [8, 9]
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]
    room = pra.ShoeBox(room_dim, fs=16000, max_order=15, absorption=0.35,
                       sigma2_awgn=1e-8)

    # Every source starts silent; each one is activated in turn below.
    for sig, delay, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=delay)

    mic_positions = np.c_[[6.5, 4.49], [6.5, 4.51]]
    room.add_microphone_array(pra.MicrophoneArray(mic_positions, room.fs))

    room.compute_rir()

    # Record each source on its own: load its signal, simulate, store the
    # microphone signals, then silence it again before the next source.
    recordings = []
    for src, sig in zip(room.sources, signals):
        src.signal[:] = sig
        room.simulate()
        recordings.append(room.mic_array.signals)
        src.signal[:] = 0.
    separate_recordings = np.array(recordings)

    # The mixture is the sum of the individual recordings.
    mics_signals = np.sum(separate_recordings, axis=0)

    for idx, name in enumerate(('mix1.wav', 'mix2.wav')):
        wavfile.write(name, fs, to_int16(mics_signals[idx].T))
    for idx, name in enumerate(('mix1_norm.wav', 'mix2_norm.wav')):
        wavfile.write(name, fs, to_int16_normalized(mics_signals[idx].T))

    # STFT frame length and the half-frame zero padding used on both sides.
    L = 2048
    half = L // 2

    # STFT of every microphone channel, then move the channel axis last.
    spectra = [
        pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=half, zp_back=half)
        for ch in mics_signals
    ]
    X = np.moveaxis(np.array(spectra), 0, 2)

    # Reference signals used to score the separation.
    ref = np.moveaxis(separate_recordings, 1, 2)

    # Keep the fraction `ratio` of frequency bins with the largest magnitude
    # of the spectrum averaged over channels and frames.
    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    S = np.sort(np.argpartition(average, -k)[-k:])

    n_iter = 30
    Y = pra.bss.sparseauxiva(X, S, n_iter, lasso=True)

    # Back to the time domain, one channel at a time.
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft,
                  zp_front=half, zp_back=half)
        for ch in range(Y.shape[2])
    ])

    # Crop reference and estimate to their overlapping region and score.
    ref_part = ref[:, :y.shape[1] - half, 0]
    est_part = y[:, half:ref.shape[1] + half]
    sdr, isr, sir, sar, perm = bss_eval_images(ref_part, est_part)
    print('SDR: {0}, SIR: {1}'.format(sdr, sir))

    for idx, name in enumerate(('demix1.wav', 'demix2.wav')):
        wavfile.write(name, fs, to_int16(y[idx].T))
    for idx, name in enumerate(('demix1_norm.wav', 'demix2_norm.wav')):
        wavfile.write(name, fs, to_int16_normalized(y[idx].T))
def test_ilrma():
    """Check that ILRMA recovers the sources of a simulated mixture.

    One signal is built per entry of the module-level ``wav_files``. Each
    source is simulated alone in a ``pra.ShoeBox`` room with two microphones,
    the recordings are summed, and the mixture is separated with
    ``pra.bss.ilrma``. The test asserts that the mean squared error between
    the separated signals and the first-microphone reference recordings,
    relative to the variance of the input signals, is below 1e-5.
    """
    # STFT frame length
    L = 256
    half = L // 2

    # Room simulation with room_dim = [8, 9] and max_order=0.
    room_dim = [8, 9]
    room = pra.ShoeBox(room_dim, fs=16000, max_order=0, sigma2_awgn=1e-8)

    # One float32 signal per source: the concatenation of its files.
    signals = [
        np.concatenate(
            [wavfile.read(f)[1].astype(np.float32) for f in source_files])
        for source_files in wav_files
    ]
    delays = [1., 0.]
    locations = [[2.5, 3], [2.5, 6]]

    # Every source starts silent; each one is activated in turn below.
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], fs=room.fs))

    # compute RIRs
    room.compute_rir()

    # Record each source separately: load its signal, simulate, store the
    # microphone signals, then silence it again before the next source.
    separate_recordings = []
    for room_source, signal in zip(room.sources, signals):
        room_source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        room_source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    # STFT of every microphone channel; the channel axis is then moved last.
    X = np.array([
        pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=half, zp_back=half)
        for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Run ILRMA
    Y = pra.bss.ilrma(X, n_iter=30, n_components=30, proj_back=True)

    # Back to the time domain, one channel at a time.
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft,
                  zp_front=half, zp_back=half)
        for ch in range(Y.shape[2])
    ])

    # Compare against the recordings at the first microphone, after
    # dropping the half-frame offset at the start of the estimate.
    ref = np.moveaxis(separate_recordings, 1, 2)
    y_aligned = y[:, half:ref.shape[1] + half]

    mse = np.mean((ref[:, :, 0] - y_aligned)**2)
    input_variance = np.var(np.concatenate(signals))

    print('Relative MSE (expect less than 1e-5):', mse / input_variance)
    assert (mse / input_variance) < 1e-5
def multinmf_conv_mu_wrapper(x, n_src, n_latent_var, stft_win_len,
                             partial_rirs=None, W_dict=None, n_iter=500,
                             l1_reg=0., random_seed=0, verbose=False):
    '''
    A wrapper around multichannel NMF using MU updates to use with
    pyroomacoustics. Performs STFT and ensures all signals are the correct
    shape.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_channel) array of time domain samples
    n_src: int
        The number of sources
    n_latent_var: int
        The number of latent variables in the NMF
    stft_win_len:
        The length of the STFT window
    partial_rirs: array_like, optional
        (n_channel x n_src x n_bins) array of partial TF. If provided, Q is
        not optimized.
    W_dict: array_like, optional
        A dictionary of atoms that can be used in the NMF. Its second
        dimension must be either n_latent_var (the dictionary is then tiled
        n_src times) or n_src * n_latent_var. If provided, W is not optimized.
    n_iter: int, optional
        The number of iterations of NMF (default 500)
    l1_reg: float, optional
        The weight of the l1 regularization term for the activations
        (default 0., not regularized)
    random_seed: unsigned int, optional
        The seed to provide to the RNG prior to initialization of NMF
        parameters. This allows to use repeatable initialization. Note that
        this reseeds the global numpy RNG.
    verbose: bool, optional
        When true, prints convergence info of NMF (default False)

    Returns
    -------
    ndarray
        The stacked separated source images, one entry per source. Each
        entry is the transposed stack of the per-channel inverse STFTs
        (presumably (n_src, n_samples, n_channel) -- confirm against
        pra.istft).

    Raises
    ------
    ValueError
        If the second dimension of W_dict is neither n_latent_var nor
        n_src * n_latent_var.
    '''
    n_channel = x.shape[1]

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_channel, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, stft_win_len // 2,
                 win=window, transform=np.fft.rfft)
        for ch in range(n_channel)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_channel)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    # Squared magnitude, normalized to unit mean
    V = np.abs(X)**2
    V /= np.mean(V)

    # Random initialization of multichannel NMF parameters
    np.random.seed(random_seed)

    K = n_latent_var * n_src

    # Row j holds the indices of the NMF components assigned to source j.
    # The builtin int is used because the np.int alias was removed from
    # NumPy. For n_src <= 0 the index set stays an empty list.
    if n_src > 0:
        source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src, -1))
    else:
        source_NMF_ind = []

    mix_psd = np.mean(V, axis=(1, 2))

    # W is initialized so that its energy follows mixture PSD
    if W_dict is None:
        W_init = 0.5 * (
            (np.abs(np.random.randn(n_bin, K)) + np.ones((n_bin, K)))
            * (mix_psd[:, np.newaxis] * np.ones((1, K)))
        )
        fix_W = False
    else:
        if W_dict.shape[1] == n_latent_var:
            W_init = np.tile(W_dict, n_src)
        elif W_dict.shape[1] == n_src * n_latent_var:
            W_init = W_dict
        else:
            raise ValueError(
                'Mismatch between dictionary size and latent variables')
        fix_W = True

    # follow average activations
    mix_act = np.mean(V, axis=(0, 2))
    H_init = 0.5 * (
        np.abs(np.random.randn(K, n_frame)) + np.ones((K, n_frame))
    ) * mix_act[np.newaxis, :]

    if partial_rirs is not None:
        # squared mag partial rirs (n_bin, n_channel, n_src)
        Q_init = np.moveaxis(np.abs(partial_rirs)**2, [2], [0])
        Q_init /= np.max(Q_init, axis=0)[None, :, :]
        fix_Q = True
    else:
        # random initialization
        Q_shape = (n_bin, n_channel, n_src)
        Q_init = (0.5 * (1.9 * np.abs(np.random.randn(*Q_shape))
                         + 0.1 * np.ones(Q_shape)))**2
        fix_Q = False

    # RUN NMF
    # NOTE(review): the unnormalized power spectrogram is passed here, while
    # W_init and H_init are scaled from the normalized V -- confirm that this
    # is intended.
    W_MU, H_MU, Q_MU, cost = \
        multinmf_conv_mu(
            np.abs(X)**2, W_init, H_init, Q_init, source_NMF_ind,
            n_iter=n_iter, fix_Q=fix_Q, fix_W=fix_W,
            H_l1_reg=l1_reg,
            verbose=verbose)

    # Computation of the spatial source images,
    # indexed below as Im[bin, frame, source, channel]
    Im = multinmf_recons_im(X, W_MU, H_MU, Q_MU, source_NMF_ind)

    sep_sources = []
    # Inverse STFT
    for j in range(n_src):
        # channel-wise istft with synthesis window
        ie_MU = [
            pra.istft(Im[:, :, j, ch].T, stft_win_len, stft_win_len // 2,
                      win=window, transform=np.fft.irfft)
            for ch in range(n_channel)
        ]
        sep_sources.append(np.array(ie_MU).T)

    return np.array(sep_sources)
def test_sparseauxiva():
    """Check that SparseAuxIVA recovers the sources of a simulated mixture.

    One signal is built per entry of the module-level ``wav_files``. Each
    source is simulated alone in a ``pra.ShoeBox`` room with two microphones,
    the recordings are summed, and the mixture is separated with
    ``pra.bss.sparseauxiva`` restricted to a subset of frequency bins. The
    test asserts that the mean squared error against the first-microphone
    reference recordings, relative to the input variance, is below 1e-3.
    """
    # One float32 signal per source: the concatenation of its files.
    signals = []
    for source_files in wav_files:
        clips = [
            wavfile.read(f)[1].astype(np.float32, order='C')
            for f in source_files
        ]
        signals.append(np.concatenate(clips))

    # Room dimensions in meters, source positions and source delays.
    room_dim = [8, 9]
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]

    room = pra.ShoeBox(room_dim, fs=16000, max_order=15, absorption=0.35,
                       sigma2_awgn=1e-8)

    # Every source starts silent; each one is activated in turn below.
    for sig, delay, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=delay)

    mic_positions = np.c_[[6.5, 4.49], [6.5, 4.51]]
    room.add_microphone_array(pra.MicrophoneArray(mic_positions, room.fs))

    room.compute_rir()

    # Record each source on its own: load its signal, simulate, store the
    # microphone signals, then silence it again before the next source.
    recordings = []
    for src, sig in zip(room.sources, signals):
        src.signal[:] = sig
        room.simulate()
        recordings.append(room.mic_array.signals)
        src.signal[:] = 0.
    separate_recordings = np.array(recordings)

    # The mixture is the sum of the individual recordings.
    mics_signals = np.sum(separate_recordings, axis=0)

    # STFT frame length and the half-frame zero padding used on both sides.
    L = 2048
    half = L // 2

    # STFT of every microphone channel, then move the channel axis last.
    spectra = [
        pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=half, zp_back=half)
        for ch in mics_signals
    ]
    X = np.moveaxis(np.array(spectra), 0, 2)

    # Keep the fraction `ratio` of frequency bins with the largest magnitude
    # of the spectrum averaged over channels and frames.
    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    strongest = np.argpartition(average, -k)[-k:]
    S = np.sort(strongest)

    Y = pra.bss.sparseauxiva(X, S)

    # Back to the time domain, one channel at a time.
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft,
                  zp_front=half, zp_back=half)
        for ch in range(Y.shape[2])
    ])

    # Compare against the recordings at the first microphone, after
    # dropping the half-frame offset at the start of the estimate.
    ref = np.moveaxis(separate_recordings, 1, 2)
    y_aligned = y[:, half:ref.shape[1] + half]

    mse = np.mean((ref[:, :, 0] - y_aligned)**2)
    input_variance = np.var(np.concatenate(signals))

    print('Relative MSE (expect less than 1e-3):', mse / input_variance)
    assert (mse / input_variance) < 1e-3
def multinmf_conv_em_wrapper(
        x, n_src, stft_win_len, n_latent_var, n_iter=500,
        A_init=None, W_init=None, H_init=None,
        update_a=True, update_w=True, update_h=True,
        verbose=False):
    '''
    A wrapper around multichannel NMF using EM updates to use with
    pyroomacoustics. Performs STFT and ensures all signals are the correct
    shape.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_chan) array of time domain samples
    n_src: int
        The number of sources
    stft_win_len: int
        The length of the STFT window
    n_latent_var: int
        The number of latent variables in the NMF (per source). Only used to
        size the model when W_init is not provided.
    n_iter: int, optional
        The number of EM iterations (default 500)
    A_init: array_like, optional
        Initial mixing parameters. Its last axis is moved to the front before
        use (presumably (n_chan x n_src x n_bin) on input -- confirm). When
        omitted, a random initialization is drawn and update_a is forced
        to True.
    W_init: array_like, optional
        Initial W. When provided, the total number of components is taken
        from its last dimension. When omitted, W is drawn at random with an
        energy that follows the mixture PSD.
    H_init: array_like, optional
        Initial activations. Drawn at random when omitted.
    update_a, update_w, update_h: bool, optional
        Forwarded to multinmf_conv_em (default True)
    verbose: bool, optional
        Forwarded to multinmf_conv_em; also prints a progress message before
        the source images are computed (default False)

    Returns
    -------
    ndarray
        The stacked separated source images, one entry per source. Each
        entry is the transposed stack of the per-channel inverse STFTs
        (presumably (n_src, n_samples, n_chan) -- confirm against
        pra.istft).
    '''
    n_chan = x.shape[1]

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_chan, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, stft_win_len // 2,
                 win=window, transform=np.fft.rfft)
        for ch in range(n_chan)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_chan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    if W_init is None:
        K = n_latent_var * n_src
    else:
        K = W_init.shape[-1]

    # Row j holds the indices of the NMF components assigned to source j.
    # The builtin int is used because the np.int alias was removed from
    # NumPy. For n_src <= 0 the index set stays an empty list.
    if n_src > 0:
        source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src, -1))
    else:
        source_NMF_ind = []

    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))

    if A_init is None:
        # random initialization
        update_a = True
        a_shape = (n_bin, n_chan, n_src)
        magnitude = 0.5 * (1.9 * np.abs(random.randn(*a_shape))
                           + 0.1 * np.ones(a_shape))
        # Unit-modulus random phase. This is normalized explicitly because
        # np.sign on complex input only returns z / |z| from NumPy 2.0 on;
        # older versions return the sign of the real part.
        phase = random.randn(*a_shape) + 1j * random.randn(*a_shape)
        phase /= np.abs(phase)
        A_init = magnitude * phase
    else:
        # reshape the partial rir input (n_bin, n_chan, n_src)
        A_init = np.moveaxis(A_init, [2], [0])

    # W is initialized so that its energy follows mixture PSD
    if W_init is None:
        W_init = 0.5 * (
            (np.abs(np.random.randn(n_bin, K)) + np.ones((n_bin, K)))
            * (mix_psd[:, np.newaxis] * np.ones((1, K)))
        )

    if H_init is None:
        H_init = 0.5 * (np.abs(np.random.randn(K, n_frame))
                        + np.ones((K, n_frame)))

    Sigma_b_init = mix_psd / 100

    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(
            X, W_init, H_init, A_init, Sigma_b_init, source_NMF_ind,
            iter_num=n_iter, update_a=update_a, update_w=update_w,
            update_h=update_h,
            verbose=verbose)

    # Move the first axis of the estimated mixing parameters to the end so
    # that they can be indexed as Ae_EM[channel, source, bin] below.
    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    if verbose:
        print('Computation of the spatial source images\n')
    # The builtin complex replaces the removed np.complex alias.
    Ie_EM = np.zeros((n_bin, n_frame, n_src, n_chan), dtype=complex)
    for j in range(n_src):
        for f in range(n_bin):
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    sep_sources = []
    # Inverse STFT
    for j in range(n_src):
        # channel-wise istft with synthesis window
        ie_EM = [
            pra.istft(Ie_EM[:, :, j, ch].T, stft_win_len, stft_win_len // 2,
                      win=window, transform=np.fft.irfft)
            for ch in range(n_chan)
        ]
        sep_sources.append(np.array(ie_EM).T)

    return np.array(sep_sources)
def example_usage_multinmf_conv_em():
    """Example of usage of the EM algorithm for multichannel NMF
    decomposition in a convolutive mixture.

    Reads ``data/Speech/3sources_3channels_mix.wav``, runs 300 iterations of
    ``multinmf_conv_em`` from a random initialization, writes the estimated
    source images as WAV files into the results directory, and plots the
    estimated W and H and the values of ``log_like_arr``.
    """
    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    # Copyright 2017 Robin Scheibler, adapted to Python
    # Copyright 2010 Alexey Ozerov
    # (alexey.ozerov -at- irisa.fr)
    #
    # This software is distributed under the terms of the GNU Public License
    # version 3 (http://www.gnu.org/licenses/gpl.txt)
    #
    # If you use this code please cite this paper
    #
    # A. Ozerov and C. Fevotte,
    # "Multichannel nonnegative matrix factorization in convolutive mixtures for audio source separation,"
    # IEEE Trans. on Audio, Speech and Lang. Proc. special issue on Signal Models and Representations
    # of Musical and Environmental Sounds, vol. 18, no. 3, pp. 550-563, March 2010.
    # Available: http://www.irisa.fr/metiss/ozerov/Publications/OzerovFevotte_IEEE_TASLP10.pdf
    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    NMF_CompPerSrcNum = 4
    nsrc = 3

    stft_win_len = 2048

    data_dir = 'data/Speech/'
    results_dir = 'data/Speech/'
    file_prefix = '3sources_3channels'

    # Input time-frequency representation
    print('Input time-frequency representation')
    fs, x = wavfile.read(data_dir + file_prefix + '_mix.wav')
    x = x / (2**15)
    nchan = x.shape[1]

    # TODO STFT
    window = pra.cosine(stft_win_len)
    # X is (nchan, nframe, nbin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, stft_win_len // 2,
                 win=window, transform=np.fft.rfft)
        for ch in range(nchan)
    ])
    # move axes to match Ozerov's order (nbin, nfram, nchan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    nbin = X.shape[0]
    nfram = X.shape[1]

    # Random initialization of multichannel NMF parameters
    print('Random initialization of multichannel NMF parameters')
    K = NMF_CompPerSrcNum * nsrc

    # Entry j holds the indices of the NMF components assigned to source j.
    source_NMF_ind = [
        np.arange(NMF_CompPerSrcNum) + j * NMF_CompPerSrcNum
        for j in range(nsrc)
    ]

    # Mixture PSD: the squared magnitude is taken per channel BEFORE summing.
    # Summing the complex squares X**2 first would let channels cancel.
    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))

    # Unit-modulus random phases for the mixing parameters.
    random_phases = (random.randn(nchan, nsrc, nbin)
                     + 1j * random.randn(nchan, nsrc, nbin))
    random_phases /= np.abs(random_phases)
    A_init = (0.5 * (1.9 * np.abs(random.randn(nchan, nsrc, nbin))
                     + 0.1 * np.ones((nchan, nsrc, nbin)))
              * random_phases)

    # W is initialized so that its energy follows mixture PSD
    W_init = 0.5 * (
        (np.abs(random.randn(nbin, K)) + np.ones((nbin, K)))
        * (mix_psd[:, np.newaxis] * np.ones((1, K)))
    )
    H_init = 0.5 * (np.abs(random.randn(K, nfram)) + np.ones((K, nfram)))
    Sigma_b_init = mix_psd / 100

    # run 300 iterations of multichannel NMF EM algorithm (with annealing)
    A_init = np.moveaxis(A_init, [2], [0])
    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init,
                         source_NMF_ind, iter_num=300)

    # Move the first axis of the estimated mixing parameters to the end so
    # that they can be indexed as Ae_EM[channel, source, bin] below.
    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    print('Computation of the spatial source images\n')
    # The builtin complex replaces the removed np.complex alias.
    Ie_EM = np.zeros((nbin, nfram, nsrc, nchan), dtype=complex)
    for j in range(nsrc):
        for f in range(nbin):
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    # Inverse STFT
    for j in range(nsrc):
        # channel-wise istft with synthesis window
        ie_EM = [
            pra.istft(Ie_EM[:, :, j, ch].T, stft_win_len, stft_win_len // 2,
                      win=window, transform=np.fft.irfft)
            for ch in range(nchan)
        ]

        # write the separated source to a wav file
        # NOTE(review): file_prefix is not part of the output name, so the
        # files are named '_sim_EM_<j>.wav' -- confirm this is intended.
        out_filename = results_dir + '_sim_EM_' + str(j) + '.wav'
        wavfile.write(out_filename, fs, np.array(ie_EM).T)

    # Plot estimated W and H
    print('Plot estimated W and H')
    plt.figure()
    plot_ind = 1
    for k in range(NMF_CompPerSrcNum):
        for j in range(nsrc):
            plt.subplot(NMF_CompPerSrcNum, nsrc, plot_ind)
            # floor at 1e-40 to keep the logarithm finite
            plt.plot(np.log10(np.maximum(W_EM[:, source_NMF_ind[j][k]], 1e-40)))
            plt.title('Source_{}, log10(W_{})'.format(j, k))
            plot_ind += 1
    plt.tight_layout()

    plt.figure()
    plot_ind = 1
    for k in range(NMF_CompPerSrcNum):
        for j in range(nsrc):
            plt.subplot(NMF_CompPerSrcNum, nsrc, plot_ind)
            plt.plot(H_EM[source_NMF_ind[j][k], :])
            plt.title('Source_{}, H_{}'.format(j, k))
            plot_ind += 1
    plt.tight_layout()
    plt.show()

    plt.figure()
    plt.plot(log_like_arr)
    plt.show()
print() print("----- MULTIPLE FRAMES AT A TIME -----") print("One shot function : ", end="") start = time.time() for k in range(num_times): y_mic_stft = np.array([ pra.stft(signals[:, k], block_size, hop, transform=np.fft.rfft, win=win).T for k in range(num_mic) ]) x_r = np.array([ pra.istft(y_mic_stft[k, :, :].T, block_size, hop, transform=np.fft.irfft) for k in range(num_mic) ]) avg_time = (time.time() - start) / num_times print("%0.3f sec" % avg_time) err_dB = 20 * np.log10( np.max( np.abs(signals[hop:x_r.shape[1] - hop, ] - x_r.T[hop:x_r.shape[1] - hop, ]))) print("Error [dB] : %0.3f" % err_dB) warnings.filterwarnings( "ignore") # to avoid warning of appending zeros to be printed print("With STFT object (not fixed) : ", end="") stft = STFT(block_size, hop=hop,