def multinmf_conv_mu_wrapper(x, n_src, n_latent_var, stft_win_len,
                             partial_rirs=None, W_dict=None, n_iter=500,
                             l1_reg=0., random_seed=0, verbose=False):
    '''
    A wrapper around multichannel NMF using MU updates to use with
    pyroomacoustics. Performs the STFT, builds the initial NMF parameters,
    runs the factorization and transforms the estimated source images back
    to the time domain.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_channel) array of time domain samples
    n_src: int
        The number of sources
    n_latent_var: int
        The number of latent variables in the NMF (per source)
    stft_win_len:
        The length of the STFT window (the hop size is half of it)
    partial_rirs: array_like, optional
        (n_channel x n_src x n_bins) array of partial TF. If provided, Q is
        not optimized.
    W_dict: array_like, optional
        A dictionary of atoms that can be used in the NMF. It must have either
        n_latent_var columns (it is then tiled for every source) or
        n_src * n_latent_var columns. If provided, W is not optimized.
    n_iter: int, optional
        The number of iterations of NMF (default 500)
    l1_reg: float, optional
        The weight of the l1 regularization term for the activations
        (default 0., not regularized)
    random_seed: unsigned int, optional
        The seed to provide to the RNG prior to initialization of NMF
        parameters. This allows to use repeatable initialization.
    verbose: bool, optional
        When true, prints convergence info of NMF (default False)

    Returns
    -------
    ndarray
        The separated source images, stacked along the first axis (one entry
        per source). Each entry is the transposed stack of the per-channel
        inverse STFT outputs, presumably (n_samples_out x n_channel).

    Raises
    ------
    ValueError
        If the number of columns of W_dict is neither n_latent_var nor
        n_src * n_latent_var.
    '''

    n_channel = x.shape[1]
    hop = stft_win_len // 2

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_channel, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, hop,
                 win=window, transform=np.fft.rfft)
        for ch in range(n_channel)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_channel)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    # Squared magnitude, normalized to unit mean energy
    V = np.abs(X)**2
    V /= np.mean(V)

    # Random initialization of multichannel NMF parameters
    np.random.seed(random_seed)

    K = n_latent_var * n_src
    # Row j lists the indices of the NMF components that belong to source j.
    # The builtin int replaces the np.int alias removed in NumPy 1.24.
    source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src, -1))

    mix_psd = np.mean(V, axis=(1, 2))

    # W is initialized so that its energy follows the mixture PSD
    if W_dict is None:
        W_init = 0.5 * (
            (np.abs(np.random.randn(n_bin, K)) + np.ones((n_bin, K)))
            * (mix_psd[:, np.newaxis] * np.ones((1, K)))
        )
        fix_W = False
    else:
        if W_dict.shape[1] == n_latent_var:
            # same dictionary for every source
            W_init = np.tile(W_dict, n_src)
        elif W_dict.shape[1] == n_src * n_latent_var:
            W_init = W_dict
        else:
            raise ValueError(
                'Mismatch between dictionary size and latent variables')
        fix_W = True

    # H follows the average activations of the mixture
    mix_act = np.mean(V, axis=(0, 2))
    H_init = 0.5 * (np.abs(np.random.randn(K, n_frame)) + np.ones(
        (K, n_frame))) * mix_act[np.newaxis, :]

    if partial_rirs is not None:
        # squared mag partial rirs (n_bin, n_channel, n_src)
        Q_init = np.moveaxis(np.abs(partial_rirs)**2, [2], [0])
        # normalize by the maximum over the frequency axis
        Q_init /= np.max(Q_init, axis=0)[None, :, :]
        fix_Q = True
    else:
        # random initialization
        Q_shape = (n_bin, n_channel, n_src)
        Q_init = (0.5 * (1.9 * np.abs(np.random.randn(*Q_shape))
                         + 0.1 * np.ones(Q_shape)))**2
        fix_Q = False

    # RUN NMF
    # NOTE(review): the initial parameters are scaled against the normalized
    # V, but the un-normalized power spectrogram is what gets factorized --
    # confirm that this is intended.
    W_MU, H_MU, Q_MU, cost = \
        multinmf_conv_mu(
            np.abs(X)**2, W_init, H_init, Q_init, source_NMF_ind,
            n_iter=n_iter, fix_Q=fix_Q, fix_W=fix_W,
            H_l1_reg=l1_reg,
            verbose=verbose)

    # Computation of the spatial source images
    Im = multinmf_recons_im(X, W_MU, H_MU, Q_MU, source_NMF_ind)

    # Inverse STFT: channel-wise istft with synthesis window
    sep_sources = []
    for j in range(n_src):
        ie_MU = [
            pra.istft(Im[:, :, j, ch].T, stft_win_len, hop,
                      win=window, transform=np.fft.irfft)
            for ch in range(n_channel)
        ]
        sep_sources.append(np.array(ie_MU).T)

    return np.array(sep_sources)
def multinmf_conv_em_wrapper(
        x, n_src, stft_win_len, n_latent_var, n_iter=500,
        A_init=None, W_init=None, H_init=None,
        update_a=True, update_w=True, update_h=True,
        verbose=False):
    '''
    A wrapper around multichannel NMF using EM updates to use with
    pyroomacoustics. Performs the STFT, builds the initial parameters that
    were not provided, runs the EM algorithm and transforms the estimated
    source images back to the time domain.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_chan) array of time domain samples
    n_src: int
        The number of sources
    stft_win_len:
        The length of the STFT window (the hop size is half of it)
    n_latent_var: int
        number of latent variables in the NMF (per source); only used to size
        the factorization when W_init is not provided
    n_iter: int, optional
        The number of EM iterations (default 500)
    A_init: ndarray, optional
        Initial mixing parameters. Its last axis is moved to the front before
        use, presumably from (n_chan, n_src, n_bin) to (n_bin, n_chan, n_src).
        When omitted, a random initialization is used and update_a is forced
        to True.
    W_init: ndarray, optional
        Initial dictionary; its last dimension sets the total number of
        components. Randomly initialized when omitted.
    H_init: ndarray, optional
        Initial activations. Randomly initialized when omitted.
    update_a, update_w, update_h: bool, optional
        Forwarded to multinmf_conv_em (default True)
    verbose: bool, optional
        Forwarded to multinmf_conv_em; also prints a progress message before
        the source images are computed (default False)

    Returns
    -------
    ndarray
        The separated source images, stacked along the first axis (one entry
        per source). Each entry is the transposed stack of the per-channel
        inverse STFT outputs, presumably (n_samples_out x n_chan).
    '''

    n_chan = x.shape[1]
    hop = stft_win_len // 2

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_chan, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, hop,
                 win=window, transform=np.fft.rfft)
        for ch in range(n_chan)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_chan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    if W_init is None:
        K = n_latent_var * n_src
    else:
        K = W_init.shape[-1]

    # Row j lists the indices of the NMF components that belong to source j.
    # The builtin int replaces the np.int alias removed in NumPy 1.24.
    source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src, -1))

    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))

    if A_init is None:
        # random initialization: there is nothing to keep fixed
        update_a = True
        A_shape = (n_bin, n_chan, n_src)
        magnitudes = 0.5 * (1.9 * np.abs(np.random.randn(*A_shape))
                            + 0.1 * np.ones(A_shape))
        # Unit-modulus random phases. The normalization is explicit because
        # np.sign on complex input only returns the sign of the real part
        # before NumPy 2.0.
        phases = (np.random.randn(*A_shape)
                  + 1j * np.random.randn(*A_shape))
        phases /= np.abs(phases)
        A_init = magnitudes * phases
    else:
        # move the last axis of the partial rir input to the front
        A_init = np.moveaxis(A_init, [2], [0])

    # W is initialized so that its energy follows the mixture PSD
    if W_init is None:
        W_init = 0.5 * (
            (np.abs(np.random.randn(n_bin, K)) + np.ones((n_bin, K)))
            * (mix_psd[:, np.newaxis] * np.ones((1, K)))
        )

    if H_init is None:
        H_init = 0.5 * (np.abs(np.random.randn(K, n_frame))
                        + np.ones((K, n_frame)))

    Sigma_b_init = mix_psd / 100

    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init,
                         source_NMF_ind,
                         iter_num=n_iter, update_a=update_a,
                         update_w=update_w, update_h=update_h,
                         verbose=verbose)

    # put the frequency axis of the estimated mixing parameters last
    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    if verbose:
        print('Computation of the spatial source images\n')
    # The builtin complex replaces the np.complex alias removed in NumPy 1.24.
    Ie_EM = np.zeros((n_bin, n_frame, n_src, n_chan), dtype=complex)
    for j in range(n_src):
        for f in range(n_bin):
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    # Inverse STFT: channel-wise istft with synthesis window
    sep_sources = []
    for j in range(n_src):
        ie_EM = [
            pra.istft(Ie_EM[:, :, j, ch].T, stft_win_len, hop,
                      win=window, transform=np.fft.irfft)
            for ch in range(n_chan)
        ]
        sep_sources.append(np.array(ie_EM).T)

    return np.array(sep_sources)
with open(cache_file, 'wb') as f: pickle.dump(corpus, f) # let's find all the sentences from male speakers in the training set male_speakers_test = list(set([s.speaker for s in filter(lambda x: x.sex == 'M', corpus.sentence_corpus['TEST'])])) male_speakers_train = list(set([s.speaker for s in filter(lambda x: x.sex == 'M', corpus.sentence_corpus['TRAIN'])])) female_speakers_test = list(set([s.speaker for s in filter(lambda x: x.sex == 'F', corpus.sentence_corpus['TEST'])])) female_speakers_train = list(set([s.speaker for s in filter(lambda x: x.sex == 'F', corpus.sentence_corpus['TRAIN'])])) print('Pick a subset of', n_speakers, 'speakers') training_set_speakers = male_speakers_train[:n_speakers] + female_speakers_train[:n_speakers] print(training_set_speakers) # compute all the spectrograms print('Compute all the spectrograms') window = np.sqrt(pra.cosine(stft_win_len)) # use sqrt because of synthesis training_set = dict() testing_set = dict() for speaker in training_set_speakers: training_set_sentences = filter(lambda x: x.speaker == speaker, corpus.sentence_corpus['TRAIN']) # X is (n_sentences, n_channel, n_frame) x = list() X = list() for sentence in training_set_sentences: print(sentence.speaker, sentence.id,) x.append(sentence.samples) X.append(pra.stft(sentence.samples, stft_win_len, stft_win_len // 2, win=window, transform=np.fft.rfft).T) # TRAIN: # Dalia says the magnitude works better... training_set[speaker] = np.concatenate([np.abs(spectrogram)**2 for spectrogram in X[0:9]], axis=1) # TEST:
def example_usage_multinmf_conv_em():
    '''
    Example of usage of EM algorithm for multichannel NMF decomposition in
    convolutive mixture.

    Input
    -----
    The mixture is read from the wav file
    data_dir + file_prefix + '_mix.wav' (paths are hard-coded below).

    Output
    ------
    Estimated source images are written in the results_dir, one wav file per
    source. The estimated W and H and the array of log-likelihood values
    returned by the EM algorithm are plotted.
    '''
    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    # Copyright 2017 Robin Scheibler, adapted to Python
    # Copyright 2010 Alexey Ozerov
    # (alexey.ozerov -at- irisa.fr)
    #
    # This software is distributed under the terms of the GNU Public License
    # version 3 (http://www.gnu.org/licenses/gpl.txt)
    #
    # If you use this code please cite this paper
    #
    # A. Ozerov and C. Fevotte,
    # "Multichannel nonnegative matrix factorization in convolutive mixtures for audio source separation,"
    # IEEE Trans. on Audio, Speech and Lang. Proc. special issue on Signal Models and Representations
    # of Musical and Environmental Sounds, vol. 18, no. 3, pp. 550-563, March 2010.
    # Available: http://www.irisa.fr/metiss/ozerov/Publications/OzerovFevotte_IEEE_TASLP10.pdf
    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    NMF_CompPerSrcNum = 4
    nsrc = 3

    stft_win_len = 2048
    hop = stft_win_len // 2

    data_dir = 'data/Speech/'
    results_dir = 'data/Speech/'
    file_prefix = '3sources_3channels'

    # Input time-frequency representation
    print('Input time-frequency representation')
    fs, x = wavfile.read(data_dir + file_prefix + '_mix.wav')
    x = x / (2**15)  # presumably 16 bit integer samples scaled to [-1, 1)
    nchan = x.shape[1]

    # TODO STFT
    window = pra.cosine(stft_win_len)
    # X is (nchan, nframe, nbin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, hop,
                 win=window, transform=np.fft.rfft)
        for ch in range(nchan)
    ])
    # move axes to match Ozerov's order (nbin, nfram, nchan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    nbin = X.shape[0]
    nfram = X.shape[1]

    # Random initialization of multichannel NMF parameters
    print('Random initialization of multichannel NMF parameters')
    K = NMF_CompPerSrcNum * nsrc
    # entry j lists the indices of the NMF components of source j
    source_NMF_ind = [
        np.arange(NMF_CompPerSrcNum) + j * NMF_CompPerSrcNum
        for j in range(nsrc)
    ]

    # Mixture PSD: the power is summed over the channels. Taking the magnitude
    # before squaring matters, since X is complex and summing X**2 would let
    # the phases cancel.
    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))

    # unit-modulus random phases
    random_phases = (np.random.randn(nchan, nsrc, nbin)
                     + 1j * np.random.randn(nchan, nsrc, nbin))
    random_phases /= np.abs(random_phases)
    A_init = (0.5 * (1.9 * np.abs(np.random.randn(nchan, nsrc, nbin))
                     + 0.1 * np.ones((nchan, nsrc, nbin)))
              * random_phases)

    # W is initialized so that its energy follows the mixture PSD
    W_init = 0.5 * (
        (np.abs(np.random.randn(nbin, K)) + np.ones((nbin, K)))
        * (mix_psd[:, np.newaxis] * np.ones((1, K)))
    )
    # W_init = np.load("W_dictionary_em.npy")
    # print(W_init.shape)
    # K = W_init.shape[1]
    H_init = 0.5 * (np.abs(np.random.randn(K, nfram)) + np.ones((K, nfram)))
    Sigma_b_init = mix_psd / 100

    # run 300 iterations of multichannel NMF EM algorithm (with annealing)
    A_init = np.moveaxis(A_init, [2], [0])  # (nbin, nchan, nsrc)
    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init,
                         source_NMF_ind, iter_num=300)
    Ae_EM = np.moveaxis(Ae_EM, [0], [2])  # back to (nchan, nsrc, nbin)

    # Computation of the spatial source images
    print('Computation of the spatial source images\n')
    # The builtin complex replaces the np.complex alias removed in NumPy 1.24.
    Ie_EM = np.zeros((nbin, nfram, nsrc, nchan), dtype=complex)
    for j in range(nsrc):
        for f in range(nbin):
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    # Inverse STFT
    for j in range(nsrc):
        # channel-wise istft with synthesis window
        ie_EM = [
            pra.istft(Ie_EM[:, :, j, ch].T, stft_win_len, hop,
                      win=window, transform=np.fft.irfft)
            for ch in range(nchan)
        ]

        # write the separated source to a wav file
        # NOTE(review): file_prefix is not part of the output name, so the
        # files are called '_sim_EM_<j>.wav' -- confirm this is intended.
        out_filename = results_dir + '_sim_EM_' + str(j) + '.wav'
        wavfile.write(out_filename, fs, np.array(ie_EM).T)

    # Plot estimated W and H
    print('Plot estimated W and H')

    plt.figure()
    plot_ind = 1
    for k in range(NMF_CompPerSrcNum):
        for j in range(nsrc):
            plt.subplot(NMF_CompPerSrcNum, nsrc, plot_ind)
            # floor at 1e-40 so that the logarithm stays finite
            plt.plot(np.log10(np.maximum(W_EM[:, source_NMF_ind[j][k]], 1e-40)))
            plt.title('Source_{}, log10(W_{})'.format(j, k))
            plot_ind += 1
    plt.tight_layout()

    plt.figure()
    plot_ind = 1
    for k in range(NMF_CompPerSrcNum):
        for j in range(nsrc):
            plt.subplot(NMF_CompPerSrcNum, nsrc, plot_ind)
            plt.plot(H_EM[source_NMF_ind[j][k], :])
            plt.title('Source_{}, H_{}'.format(j, k))
            plot_ind += 1
    plt.tight_layout()
    plt.show()

    plt.figure()
    plt.plot(log_like_arr)
    plt.show()