def gen_speech_at_mic_stft(phi_ks, source_signals, mic_array_coord, noise_power, fs, fft_size=1024):
    """
    generate microphone signals with short time Fourier transform
    :param phi_ks: azimuth of the acoustic sources
    :param source_signals: speech signals for each arrival angle, one per row
    :param mic_array_coord: x and y coordinates of the microphone array
    :param noise_power: the variance of the microphone noise signal
    :param fs: sampling frequency
    :param fft_size: number of FFT bins
    :return: y_hat_stft: received (complex) signal at microphones
             y_hat_stft_noiseless: the noiseless received (complex) signal at microphones
             source_stft: the (complex) STFT of the source signals
    """
    frame_shift_step = int(fft_size)  # non-overlapping adjacent frames
    K = source_signals.shape[0]  # number of point sources
    num_mic = mic_array_coord.shape[1]  # number of microphones

    # Generate the impulse responses for the array and source directions
    impulse_response = gen_far_field_ir(np.reshape(phi_ks, (1, -1), order='F'),
                                        mic_array_coord, fs)
    # Now generate all the microphone signals
    y = np.zeros((num_mic, source_signals.shape[1] + impulse_response.shape[2] - 1),
                 dtype=np.float32)
    for src in range(K):
        for mic in range(num_mic):
            y[mic] += fftconvolve(impulse_response[src, mic], source_signals[src])

    # Now do the short time Fourier transform
    # The resulting signal is M x fft_size/2+1 x number of frames
    y_hat_stft_noiseless = \
        np.array([pra.stft(signal, fft_size, frame_shift_step,
                           transform=mkl_fft.rfft).T for signal in y]) / np.sqrt(fft_size)

    # Add noise to the signals
    y_noisy = y + np.sqrt(noise_power) * np.array(np.random.randn(*y.shape), dtype=np.float32)

    # compute the STFT of the sources
    source_stft = \
        np.array([pra.stft(s_loop, fft_size, frame_shift_step,
                           transform=mkl_fft.rfft).T for s_loop in source_signals]) / np.sqrt(fft_size)

    y_hat_stft = \
        np.array([pra.stft(signal, fft_size, frame_shift_step,
                           transform=mkl_fft.rfft).T for signal in y_noisy]) / np.sqrt(fft_size)

    return y_hat_stft, y_hat_stft_noiseless, source_stft
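# A minimal usage sketch for gen_speech_at_mic_stft. It assumes the same module
# context as the function above (gen_far_field_ir, pra, mkl_fft, fftconvolve,
# and numpy as np); the "speech" is white noise standing in for real
# recordings, and the array geometry is purely illustrative.
if __name__ == '__main__':
    fs_demo = 16000
    num_mic_demo = 6
    # hypothetical uniform linear array along x, 5 cm spacing
    mic_coord_demo = np.vstack([np.arange(num_mic_demo) * 0.05,
                                np.zeros(num_mic_demo)])
    phi_demo = np.array([np.pi / 4, 3 * np.pi / 4])  # two source azimuths
    src_demo = np.random.randn(2, fs_demo)           # 1 s of noise per source
    y_stft, y_stft_clean, s_stft = gen_speech_at_mic_stft(
        phi_demo, src_demo, mic_coord_demo, noise_power=1e-3, fs=fs_demo)
    # y_stft has shape (num_mic, fft_size // 2 + 1, n_frames)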
def feature_Vector_testPos(index_src, D=256):
    '''
    Creates the feature vector (the ratio of the two microphones' acoustic
    transfer functions) for a given test position, using the microphones
    in the room.
    '''
    overlap = 1
    rir0 = room_test.rir[0][index_src]
    rir1 = room_test.rir[1][index_src]
    len0, len1 = rir0.shape[0], rir1.shape[0]
    atf_0 = pra.stft(rir0, L=D, hop=int(len0 * overlap), win=pra.windows.hann(N=D))
    atf_1 = pra.stft(rir1, L=D, hop=int(len1 * overlap), win=pra.windows.hann(N=D))
    return atf_0 / atf_1
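# Hedged sketch of the context this function expects: a simulated
# pyroomacoustics room bound to the global name room_test, with two
# microphones and precomputed RIRs. All geometry values here are illustrative.
import numpy as np
import pyroomacoustics as pra

room_test = pra.ShoeBox([6, 5], fs=16000, max_order=10, absorption=0.3)
room_test.add_source([2.0, 3.0])  # one test position -> index_src=0
room_test.add_microphone_array(
    pra.MicrophoneArray(np.c_[[4.0, 2.0], [4.2, 2.0]], room_test.fs))
room_test.compute_rir()

feature = feature_Vector_testPos(index_src=0)  # complex ATF ratio per bin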
def matrix_doa():
    global source_signal
    #print(source_signal)
    #algo_names = ['SRP', 'MUSIC', 'TOPS', 'CSSM', 'WAVES']
    algo_name = 'SRP'
    #print('The algorithm {} will be used.'.format(algo_name))
    nfft = 256  # FFT size

    ################################
    # Compute the STFT frames needed
    X = np.array([
        pra.stft(source_signal[:, i], nfft, nfft // 2,
                 transform=np.fft.rfft).T for i in range(CHANNELS)
    ])

    ##############################################
    # Construct the new DOA object
    # the max_four parameter is necessary for FRIDA only
    doa = pra.doa.algorithms[algo_name](R, fs, nfft, c=c)

    # this call performs localization on the frames in X
    doa.locate_sources(X, freq_range=[1000, 3000])

    # doa.azimuth_recon contains the reconstructed location of the source
    angle = doa.azimuth_recon / np.pi * 180
    print('  Recovered azimuth:', angle, 'degrees')
    return angle
def run_doa(angle, h, algo, doa_kwargs, freq_bins, speakers_numbering): ''' Run the doa localization for one source location and one algorithm ''' # Prepare the DOA localizer object algo_key = doa_kwargs['algo_obj'] doa = pra.doa.algorithms[algo_key](mic_array, fs, nfft, c=c, num_src=1, dim=3, **doa_kwargs) # get the loudspeaker index from its name spkr = speakers_numbering[h] # open the recording file filename = fn.format(name=sample_name, spkr=spkr, angle=angle) fs_data, data = wavfile.read(filename) if fs_data != fs: raise ValueError('Sampling frequency mismatch') # do time-freq decomposition X = np.array([ pra.stft(signal, nfft, stft_hop, transform=np.fft.rfft).T for signal in data.T ]) # run doa doa.locate_sources(X, freq_bins=freq_bins) col = float(doa.colatitude_recon[0]) az = float(doa.azimuth_recon[0]) # manual calibration groundtruth col_gt_man = locations['speakers_manual_colatitude'][h] az_gt_man = np.radians(int(angle)) error_man = pra.doa.great_circ_dist(1., col, az, col_gt_man, az_gt_man) # optimized calibration groundtruth col_gt_opt = locations['sources'][h]['colatitude'][angle] az_gt_opt = locations['sources'][h]['azimuth'][angle] error_opt = pra.doa.great_circ_dist(1., col, az, col_gt_opt, az_gt_opt) print(algo, h, angle, ': Err Man=', error_man, 'Opt=', error_opt) return { 'algo': algo, 'angle': angle, 'spkr_height': h, 'loc_man': (col_gt_man, az_gt_man), 'loc_opt': (col_gt_opt, az_gt_opt), 'loc_doa': (col, az), 'error_man': float(error_man), 'error_opt': float(error_opt), }
def difference_of_arrivals(speed_of_sound, signal_list, algorithm_name,
                           num_sources, *mic_location):
    """Gets an azimuth and co-latitude estimate from the microphone signals.

    Args:
        speed_of_sound: specific speed of sound
        signal_list: the microphone signals
        algorithm_name: specific direction of arrival (DOA) method
        num_sources: number of sources to find
        mic_location: location of each microphone

    Returns:
        doa.azimuth_recon: Azimuth angle
        doa.colatitude_recon: Co-latitude angle
    """
    # Constants
    fs = 16000  # sampling frequency
    nfft = 256  # FFT size

    # Stack the n microphone locations in [x, y, z] order
    m = np.vstack(list(zip(*mic_location)))

    # Compute the STFT frames of each microphone signal
    x = np.array([
        pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
        for signal in signal_list
    ])

    # Frequency range (Hz)
    freq_range = [0, 250]

    # Construct the new DOA object
    # (note: the grid below spans [-90, 90] degrees, i.e. elevation;
    # colatitude conventionally spans [0, 180])
    doa = pra.doa.algorithms[algorithm_name](
        L=m,
        fs=fs,
        nfft=nfft,
        c=speed_of_sound,
        num_src=num_sources,
        max_four=4,
        dim=3,
        azimuth=np.linspace(-180., 180., 360) * np.pi / 180,
        colatitude=np.linspace(-90., 90., 180) * np.pi / 180)

    # Locate the sources
    doa.locate_sources(x, freq_range=freq_range)

    # Return all in radians
    return doa.azimuth_recon, doa.colatitude_recon
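# Minimal API sketch for difference_of_arrivals: four hypothetical microphones
# at the vertices of a small tetrahedron and white-noise stand-ins for real
# recordings, so the recovered angles are meaningless; this only shows the
# calling convention, with one [x, y, z] location per positional argument.
if __name__ == '__main__':
    import numpy as np

    mics = ([0.0, 0.0, 0.0], [0.1, 0.0, 0.0], [0.0, 0.1, 0.0], [0.0, 0.0, 0.1])
    sigs = [np.random.randn(16000) for _ in mics]
    az, col = difference_of_arrivals(343.0, sigs, 'MUSIC', 1, *mics)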
def test_stft_nowindow(self): frames = 100 fftsize = [128, 256, 512] hop_div = [1, 2] loops = 10 for n in fftsize: for div in hop_div: for epoch in range(loops): x = np.random.randn(frames * n // div + n - n // div) X = pra.stft(x, n, n // div, transform=np.fft.rfft) y = pra.istft(X, n, n // div, transform=np.fft.irfft) # because of overlap, there is a scaling at reconstruction y[n // div:-n // div] /= div self.assertTrue(np.allclose(x, y))
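# For reference, the same round trip with the newer pra.transform.stft module,
# which replaces the legacy pra.stft/pra.istft pair exercised above. This is a
# hedged sketch: compute_synthesis_window is assumed to pair a hann analysis
# window with a matching synthesis window for perfect reconstruction.
import numpy as np
import pyroomacoustics as pra

L, hop = 512, 256
x = np.random.randn(100 * hop + L)
awin = pra.hann(L)
swin = pra.transform.stft.compute_synthesis_window(awin, hop)
X = pra.transform.stft.analysis(x, L, hop, win=awin)
y = pra.transform.stft.synthesis(X, L, hop, win=swin)
# up to edge effects at both ends, y reconstructs x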
def get_difference_of_arrivals(self, signal_list, *mic_location):
    """Returns an azimuth and co-latitude estimate computed from all the
    microphones.

    Note: all angles are returned in radians

    Args:
        signal_list: (list) microphone signals
        *mic_location: (list) location of each microphone

    Returns:
        doa.azimuth_recon: (float) Azimuth angle
        doa.colatitude_recon: (float) Co-latitude angle
    """
    # Stack the n microphone locations in [x, y, z] order
    m = np.vstack(list(zip(*mic_location)))

    # TODO: Figure out this deprecation
    # Compute the STFT frames of each microphone signal
    if self.transform:
        x = np.array([
            pra.stft(signal, self.fft_size, self.fft_size // 2,
                     transform=np.fft.rfft).T for signal in signal_list
        ])
    else:
        x = np.array([
            pra.transform.stft.analysis(signal, self.fft_size,
                                        self.fft_size // 2).T
            for signal in signal_list
        ])

    # Construct the new DOA object
    doa = pra.doa.algorithms.get(self.algo_name)(
        L=m,
        fs=self.fs,
        nfft=self.fft_size,
        c=self.sound_speed,
        num_src=self.num_sources,
        max_four=4,
        dim=3,
        azimuth=np.linspace(-180., 180., 360) * np.pi / 180,
        colatitude=np.linspace(-90., 90., 180) * np.pi / 180)

    doa.locate_sources(x, freq_range=self.freq_range)
    return doa.azimuth_recon, doa.colatitude_recon
def plot(self, L=512, hop=128, zpb=0, phonems=False, **kwargs): try: import matplotlib.pyplot as plt import seaborn as sns except ImportError: return sns.set_style("white") X = stft( self.data, L=L, hop=hop, zp_back=zpb, transform=np.fft.rfft, win=np.hanning(L + zpb), ) X = 10 * np.log10(np.abs(X)**2).T plt.imshow(X, origin="lower", aspect="auto") ticks = [] ticklabels = [] if phonems: for phonem in self.phonems: plt.axvline(x=phonem["bnd"][0] / hop) plt.axvline(x=phonem["bnd"][1] / hop) ticks.append((phonem["bnd"][1] + phonem["bnd"][0]) / 2 / hop) ticklabels.append(phonem["name"]) else: for word in self.words: plt.axvline(x=word.boundaries[0] / hop) plt.axvline(x=word.boundaries[1] / hop) ticks.append( (word.boundaries[1] + word.boundaries[0]) / 2 / hop) ticklabels.append(word.word) plt.xticks(ticks, ticklabels, rotation=-45) plt.yticks([], []) plt.tick_params(axis="both", which="major", labelsize=14)
def difference_of_arrivals(self, signal_list, *mic_location):
    """Returns an azimuth and co-latitude estimate computed from all the
    microphones.

    Note: all angles are returned in radians

    Args:
        signal_list: (list) microphone signals
        *mic_location: (list) location of each microphone

    Returns:
        doa.azimuth_recon: (float) Azimuth angle
        doa.colatitude_recon: (float) Co-latitude angle

    Raises:
        ValueError: Signal list is empty
        ValueError: None in signal list
        ValueError: Microphone list is empty
        ValueError: None in microphone list
    """
    if not signal_list:
        raise ValueError('Error. Signal list is empty.')
    if np.array(signal_list).shape[0] == 1 and None in signal_list:
        raise ValueError('Error. None in signal list.')
    # This works for lists of lists, but not for a single list
    if any([True for signal in signal_list if None in signal]):
        raise ValueError('Error. None in signal list.')
    if not mic_location:
        raise ValueError('Error. Microphone location list is empty.')
    if None in mic_location:
        raise ValueError('Error. None in microphone location list.')

    # Stack the n microphone locations in [x, y, z] order
    m = np.vstack(list(zip(*mic_location)))

    # TODO: Figure out this deprecation
    # Compute the STFT frames of each microphone signal
    if self.transform:
        x = np.array([
            pra.stft(signal, self.fft_size, self.fft_size // 2,
                     transform=np.fft.rfft).T for signal in signal_list
        ])
    else:
        x = np.array([
            pra.transform.stft.analysis(signal, self.fft_size,
                                        self.fft_size // 2).T
            for signal in signal_list
        ])

    # Construct the new DOA object
    doa = pra.doa.algorithms.get(self.algo_name)(
        L=m,
        fs=self.fs,
        nfft=self.fft_size,
        c=self.sound_speed,
        num_src=self.num_sources,
        max_four=4,
        dim=3,
        azimuth=np.linspace(-180., 180., 360) * np.pi / 180,
        colatitude=np.linspace(-90., 90., 180) * np.pi / 180)

    doa.locate_sources(x, freq_range=self.freq_range)
    return doa.azimuth_recon, doa.colatitude_recon
def test_sparseauxiva():
    fs = 16000
    signals = [
        np.concatenate([
            wavfile.read(f)[1].astype(np.float32, order='C')
            for f in source_files
        ]) for source_files in wav_files
    ]
    wavfile.write('sample1.wav', fs, np.asarray(signals[0], dtype=np.int16))
    wavfile.write('sample2.wav', fs, np.asarray(signals[1], dtype=np.int16))

    # Define the room environment, as well as the microphone array and source locations.

    # Room 8m by 9m
    room_dim = [8, 9]
    # source locations and delays
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]
    # create a mildly reverberant room with sources and mics
    room = pra.ShoeBox(room_dim, fs=16000, max_order=15, absorption=0.35,
                       sigma2_awgn=1e-8)

    # add mic and good source to room
    # Add silent signals to all sources
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], room.fs))

    # Compute the RIRs as in the Room Impulse Response generation section.

    # compute RIRs
    room.compute_rir()

    # Record each source separately
    separate_recordings = []
    for source, signal in zip(room.sources, signals):
        source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    # save mixed signals as wav files
    wavfile.write('mix1.wav', fs, np.asarray(mics_signals[0].T, dtype=np.int16))
    wavfile.write('mix2.wav', fs, np.asarray(mics_signals[1].T, dtype=np.int16))
    wavfile.write(
        'mix1_norm.wav', fs,
        np.asarray(mics_signals[0].T / np.max(np.abs(mics_signals[0].T)) * 32767,
                   dtype=np.int16))
    wavfile.write(
        'mix2_norm.wav', fs,
        np.asarray(mics_signals[1].T / np.max(np.abs(mics_signals[1].T)) * 32767,
                   dtype=np.int16))

    # STFT frame length
    L = 2048

    # START BSS
    ###########
    # Preprocessing
    # Observation vector in the STFT domain
    X = np.array([
        pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Reference signal to calculate performance of BSS
    ref = np.moveaxis(separate_recordings, 1, 2)

    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    S = np.argpartition(average, -k)[-k:]
    S = np.sort(S)

    n_iter = 30

    # Run SparseAuxIva
    Y = pra.bss.sparseauxiva(X, S, n_iter, lasso=True)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_front=L // 2,
                  zp_back=L // 2) for ch in range(Y.shape[2])
    ])

    # Compare SIR and SDR with our reference signal
    sdr, isr, sir, sar, perm = bss_eval_images(
        ref[:, :y.shape[1] - L // 2, 0], y[:, L // 2:ref.shape[1] + L // 2])
    print('SDR: {0}, SIR: {1}'.format(sdr, sir))

    wavfile.write('demix1.wav', fs, np.asarray(y[0].T, dtype=np.int16))
    wavfile.write('demix2.wav', fs, np.asarray(y[1].T, dtype=np.int16))
    wavfile.write(
        'demix1_norm.wav', fs,
        np.asarray(y[0].T / np.max(np.abs(y[0].T)) * 32767, dtype=np.int16))
    wavfile.write(
        'demix2_norm.wav', fs,
        np.asarray(y[1].T / np.max(np.abs(y[1].T)) * 32767, dtype=np.int16))
def srp_phat(s, fs, nFFT=None, center=None, d=None, azimuth_estm=None, mode=None):
    '''
    Applies the Steered Response Power - Phase Transform (SRP-PHAT) algorithm.
    Uses the pyroomacoustics module.

    Input params
    ------------
    s: numpy array
        Stacked microphone array signals. (NOTE: the number of microphones is
        extracted from the size of the input signal, since the input signal
        is of size MxN, where M is the number of microphones and N is the
        length of the audio signal.)
    fs: int
        Sampling frequency
    nFFT: int
        FFT size. Default 1024
    center: numpy array
        Defines the center of the room. Default [0,0]
    d: int
        Distance between microphones. Default 10cm.
    azimuth_estm: numpy array
        Candidate azimuth estimates, representing location estimates of
        speakers. Default expects the microphone to be in the middle of a
        table and speakers located around it. Assumes two speakers - [60,120]
    mode: str
        Defines the microphone setup layout. Default mode = linear.
        mode = linear
        mode = circular
    '''
    if nFFT is None:
        nFFT = 1024
    if center is None:
        center = [0, 0]
    if d is None:
        d = 0.1
    if azimuth_estm is None:
        azimuth_estm = [60, 120]

    freq_bins = np.arange(30, 330)  # individual frequency bins used to run DoA
    M = s.shape[0]  # number of microphones
    phi = 0  # assume angle between microphones is 0 (same y-axis)
    radius = d * M / (2 * np.pi)  # radius for the circular microphone layout
    c = 343.0  # speed of sound

    # Define the microphone array layout
    if mode == 'circular':
        L = pra.circular_2D_array(center, M, phi, radius)
    else:  # default: linear
        L = pra.linear_2D_array(center, M, phi, d)

    nSrc = len(azimuth_estm)  # number of speakers

    # STFT of each microphone signal
    s_FFT = np.array([
        pra.stft(sig, nFFT, nFFT // 2, transform=np.fft.rfft).T for sig in s
    ])

    # SRP
    doa = pra.doa.srp.SRP(L, fs, nFFT, c, max_four=4, num_src=nSrc)
    # Apply SRP-PHAT
    doa.locate_sources(s_FFT, freq_bins=freq_bins)

    # PLOTTING
    doa.polar_plt_dirac()
    plt.title('SRP-PHAT')
    print('SRP-PHAT')
    print('Speakers at: ', np.sort(doa.azimuth_recon) / np.pi * 180, 'degrees')
    plt.show()
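# Illustrative call with synthetic data (assumes this module's imports, i.e.
# numpy as np, pyroomacoustics as pra, and matplotlib's plt): two microphones
# 10 cm apart fed independent white noise, so the polar plot only
# demonstrates the API, not a meaningful localization.
if __name__ == '__main__':
    fs_demo = 16000
    s_demo = np.vstack([np.random.randn(fs_demo), np.random.randn(fs_demo)])
    srp_phat(s_demo, fs_demo)  # defaults: linear layout, nFFT=1024, two speakers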
def test_bss(algo, L):

    # Room dimensions in meters
    room_dim = [8, 9]

    # create a room with sources and mics
    room = pra.ShoeBox(room_dim, fs=16000, max_order=0, sigma2_awgn=1e-8)

    # get signals
    signals = [
        np.concatenate(
            [wavfile.read(f)[1].astype(np.float32) for f in source_files])
        for source_files in wav_files
    ]
    delays = [1., 0.]
    locations = [[2.5, 3], [2.5, 6]]

    # add mic and good source to room
    # Add silent signals to all sources
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], fs=room.fs))

    # compute RIRs
    room.compute_rir()

    # Record each source separately
    separate_recordings = []
    for source, signal in zip(room.sources, signals):
        source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    ## STFT analysis
    # shape == (n_frames, n_freq, n_chan)
    X = pra.transform.analysis(mics_signals.T, L, L, zp_front=L // 2,
                               zp_back=L // 2, bits=64)
    X_test = np.array([
        pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X_test = np.moveaxis(X_test, 0, 2)

    ## START BSS
    if choices[algo] == 'auxIVA':
        # Run AuxIVA
        Y = pra.bss.auxiva(X, n_iter=30, proj_back=True)
        max_mse = 1e-5
    elif choices[algo] == 'ILRMA':
        # Run ILRMA
        Y = pra.bss.ilrma(X, n_iter=30, n_components=30, proj_back=True)
        max_mse = 1e-5
    elif choices[algo] == 'sparseauxIVA':
        # Estimate set of active frequency bins
        ratio = 0.35
        average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
        k = np.int_(average.shape[0] * ratio)
        S = np.sort(np.argpartition(average, -k)[-k:])
        # Run SparseAuxIva
        Y = pra.bss.sparseauxiva(X, S, n_iter=30, proj_back=True)
        max_mse = 1e-4

    ## STFT Synthesis
    y = pra.transform.synthesis(Y, L, L, zp_front=L // 2, zp_back=L // 2,
                                bits=64).T

    # Calculate MSE
    #############
    ref = np.moveaxis(separate_recordings, 1, 2)
    y_aligned = y[:, L // 2:ref.shape[1] + L // 2]

    mse = np.mean((ref[:, :y_aligned.shape[1], 0] - y_aligned)**2)
    input_variance = np.var(np.concatenate(signals))

    print('%s with frame length of %d: Relative MSE (expected less than %.e)'
          % (choices[algo], L, max_mse), mse / input_variance)
    assert (mse / input_variance) < max_mse
for s in speech_signals.T:
    s[:] = pra.highpass(s, fs, fc=150.)
for s in silence.T:
    s[:] = pra.highpass(s, fs, fc=150.)

# Normalize the amplitude
n_factor = 0.95 / np.max(np.abs(speech_signals))
speech_signals *= n_factor
silence *= n_factor

# estimate noise floor
y_noise_stft = []
for k in range(num_mic):
    y_stft = pra.stft(silence[:, k], fft_size, frame_shift_step,
                      transform=rfft, win=win_stft).T / np.sqrt(fft_size)
    y_noise_stft.append(y_stft)
y_noise_stft = np.array(y_noise_stft)
noise_floor = np.mean(np.abs(y_noise_stft)**2)

# estimate SNR in dB (on 1st microphone)
noise_var = np.mean(np.abs(silence)**2)
sig_var = np.mean(np.abs(speech_signals)**2)
# rough estimate of SNR
SNR = 10 * np.log10((sig_var - noise_var) / noise_var)
print('Estimated SNR: ' + str(SNR))

# Compute DFT of snapshots
# -------------------------
def beamformed_doa_plot(comb):
    f1_data = f1['data']
    f2_data = f2['data']

    # azimuth = np.array([math.atan2(1.5, 0.5), math.atan2(1.5, -0.5)])
    azimuth = np.array([
        90.,
        270.,
    ]) * np.pi / 180
    distance = 1.5

    c = 343.  # speed of sound
    fs = 16000  # sampling frequency
    nfft = 256  # FFT size
    freq_range = [300, 400]
    sr = 16000

    snr_db = 5.  # signal-to-noise ratio
    # sigma2 = 10**(-snr_db / 10) / (4. * np.pi * distance)**2

    # Add sources of 1 second duration
    rng = np.random.RandomState(23)
    duration_samples = int(sr)

    room_dim = np.r_[4., 6.]
    room = pra.ShoeBox(room_dim, fs=sr)
    echo = pra.linear_2D_array(center=(room_dim / 2), M=5, phi=0, d=0.5)
    room.add_microphone_array(pra.MicrophoneArray(echo, room.fs))
    # R = pra.linear_2D_array([2, 1.5], 4, 0, 0.04)

    # source_location = room_dim / 2 + distance * np.r_[np.cos(ang), np.sin(ang)]
    # source_signal = rng.randn(duration_samples)
    # room.add_source(source_location, signal=source_signal)

    # room.add_source(np.array([1.5, 4.5]), delay=0., signal=f1_data)
    # room.add_source(np.array([2.5, 4.5]), delay=0., signal=f2_data[:len(f1_data)])

    for ang in azimuth:
        source_location = room_dim / 2 + distance * np.r_[np.cos(ang), np.sin(ang)]
        source_signal = rng.randn(duration_samples)
        room.add_source(source_location, signal=source_signal)

    room.simulate()

    X = np.array([
        pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
        for signal in room.mic_array.signals
    ])

    # DOA_algorithm = 'MUSIC'
    # spatial_resp = dict()

    doa = pra.doa.algorithms['MUSIC'](echo, fs, nfft, c=c, num_src=2, max_four=4)

    # this call performs localization on the frames in X
    doa.locate_sources(X, freq_range=freq_range)

    spatial_resp = doa.grid.values

    # normalize
    min_val = spatial_resp.min()
    max_val = spatial_resp.max()
    spatial_resp = (spatial_resp - min_val) / (max_val - min_val)

    # plotting params
    base = 1.
    height = 10.
    true_col = [0, 0, 0]

    phi_plt = doa.grid.azimuth

    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='polar')
    c_phi_plt = np.r_[phi_plt, phi_plt[0]]
    c_dirty_img = np.r_[spatial_resp, spatial_resp[0]]
    ax.plot(
        c_phi_plt,
        base + height * c_dirty_img,
        linewidth=3,
        alpha=0.55,
        linestyle='-',
        # label="spatial spectrum"
    )
    # plt.title('MUSIC')

    # plot true loc
    # for angle in azimuth:
    #     ax.plot([angle, angle], [base, base + height], linewidth=3, linestyle='--',
    #             color=true_col, alpha=0.6)
    # K = len(azimuth)
    # ax.scatter(azimuth, base + height*np.ones(K), c=np.tile(true_col,
    #            (K, 1)), s=500, alpha=0.75, marker='*',
    #            linewidths=0,
    #            # label='true locations'
    #            )

    plt.legend()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles,
              labels,
              framealpha=0.5,
              scatterpoints=1,
              loc='center right',
              fontsize=16,
              ncol=1,
              bbox_to_anchor=(1.6, 0.5),
              handletextpad=.2,
              columnspacing=1.7,
              labelspacing=0.1)

    ax.set_xticks(np.linspace(0, 2 * np.pi, num=12, endpoint=False))
    ax.xaxis.set_label_coords(0.5, -0.11)
    ax.set_yticks(np.linspace(0, 1, 2))
    ax.xaxis.grid(b=True, color=[0.3, 0.3, 0.3], linestyle=':')
    ax.yaxis.grid(b=True, color=[0.3, 0.3, 0.3], linestyle='--')
    ax.set_ylim([0, 1.05 * (base + height)])
    plt.show()
source_location = room_dim / 2 + distance * np.r_[np.cos(azimuth), np.sin(azimuth)]
source_signal = np.random.randn((nfft // 2 + 1) * nfft)
aroom.add_source(source_location, signal=source_signal)

# We use a circular array with radius 15 cm
# and 12 microphones
R = pra.circular_2D_array(room_dim / 2, 12, 0., 0.15)
aroom.add_microphone_array(pra.MicrophoneArray(R, fs=aroom.fs))

# run the simulation
aroom.simulate()

################################
# Compute the STFT frames needed
X = np.array([
    pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
    for signal in aroom.mic_array.signals
])

##############################################
# Now we can test all the algorithms available
algo_names = sorted(pra.doa.algorithms.keys())

for algo_name in algo_names:
    # Construct the new DOA object
    # the max_four parameter is necessary for FRIDA only
    doa = pra.doa.algorithms[algo_name](R, fs, nfft, c=c, max_four=4)

    # this call performs localization on the frames in X
    doa.locate_sources(X, freq_bins=freq_bins)
def parallel_loop(filename, algo_names, pmt):
    '''
    This is one loop of the computation,
    extracted for parallelization.
    '''

    # We need to do a bunch of imports
    import pyroomacoustics as pra
    import os
    import numpy as np
    from scipy.io import wavfile
    import mkl as mkl_service
    import copy

    import doa
    from tools import rfft

    # for such parallel processing, it is better
    # to deactivate multithreading in mkl
    mkl_service.set_num_threads(1)

    # extract the speaker names from the filename
    name = os.path.splitext(os.path.basename(filename))[0]
    sources = name.split('-')

    # number of sources
    K = len(sources)

    # Import speech signal
    fs_file, rec_signals = wavfile.read(filename)

    # sanity check
    if pmt['fs'] != fs_file:
        raise ValueError("The sampling frequency of the files doesn't match that of the script")

    speech_signals = np.array(rec_signals[:, pmt['mic_select']], dtype=np.float32)

    # Remove the DC bias
    for s in speech_signals.T:
        s[:] = pra.highpass(s, pmt['fs'], 100.)

    if pmt['stft_win']:
        stft_win = np.hanning(pmt['nfft'])
    else:
        stft_win = None

    # Normalize the amplitude
    speech_signals *= pmt['scaling']

    # Compute the STFT of the signal
    # ------------------------------
    y_mic_stft = []
    for k in range(speech_signals.shape[1]):
        y_stft = pra.stft(speech_signals[:, k], pmt['nfft'], pmt['stft_hop'],
                          transform=rfft, win=stft_win).T / np.sqrt(pmt['nfft'])
        y_mic_stft.append(y_stft)
    y_mic_stft = np.array(y_mic_stft)

    # estimate SNR in dB (on 1st microphone)
    sig_var = np.var(speech_signals)
    SNR = 10 * np.log10((sig_var - pmt['noise_var']) / pmt['noise_var'])

    freq_bins = copy.copy(pmt['freq_bins'][K - 1])

    # dict for output
    phi_recon = {}

    for alg in algo_names:

        # Use the convenient dictionary of algorithms defined
        d = doa.algos[alg](
            L=pmt['mic_array'],
            fs=pmt['fs'],
            nfft=pmt['nfft'],
            num_src=K,
            c=pmt['c'],
            theta=pmt['phi_grid'],
            max_four=pmt['M'],
            num_iter=pmt['num_iter'],
            G_iter=pmt['G_iter'])

        # perform localization
        d.locate_sources(y_mic_stft, freq_bins=freq_bins[alg])

        # store result
        phi_recon[alg] = d.phi_recon

    return SNR, sources, phi_recon
def gen_sig_at_mic_stft(phi_ks, alpha_ks, mic_array_coord, SNR, fs,
                        fft_size=1024, Ns=256):
    """
    generate microphone signals with short time Fourier transform
    :param phi_ks: azimuth of the acoustic sources
    :param alpha_ks: power of the sources
    :param mic_array_coord: x and y coordinates of the microphone array
    :param SNR: signal to noise ratio at the microphone
    :param fs: sampling frequency
    :param fft_size: number of FFT bins
    :param Ns: number of time snapshots used to estimate covariance matrix
    :return: y_hat_stft: received (complex) signal at microphones
             y_hat_stft_noiseless: the noiseless received (complex) signal at microphones
    """
    frame_shift_step = int(fft_size)  # non-overlapping adjacent frames
    K = alpha_ks.shape[0]  # number of point sources
    num_mic = mic_array_coord.shape[1]  # number of microphones

    # Generate the impulse responses for the array and source directions
    impulse_response = gen_far_field_ir(np.reshape(phi_ks, (1, -1), order='F'),
                                        mic_array_coord, fs)
    # Now generate some noise
    # source_signal = np.random.randn(K, Ns * fft_size) * np.sqrt(alpha_ks[:, np.newaxis])
    source_signal = np.random.randn(K, fft_size + (Ns - 1) * frame_shift_step) * \
                    np.sqrt(np.reshape(alpha_ks, (-1, 1), order='F'))

    # Now generate all the microphone signals
    y = np.zeros((num_mic, source_signal.shape[1] + impulse_response.shape[2] - 1),
                 dtype=np.float32)
    for src in range(K):
        for mic in range(num_mic):
            y[mic] += fftconvolve(impulse_response[src, mic], source_signal[src])

    # Now do the short time Fourier transform
    # The resulting signal is M x fft_size/2+1 x number of frames
    y_hat_stft_noiseless = \
        np.array([pra.stft(signal, fft_size, frame_shift_step,
                           transform=mkl_fft.rfft).T for signal in y]) / np.sqrt(fft_size)

    # compute noise variance based on SNR
    signal_energy = linalg.norm(y_hat_stft_noiseless.flatten())**2
    noise_energy = signal_energy / 10**(SNR * 0.1)
    sigma2_noise = noise_energy / y_hat_stft_noiseless.size

    # Add noise to the signals
    y_noisy = y + np.sqrt(sigma2_noise) * np.array(np.random.randn(*y.shape),
                                                   dtype=np.float32)

    y_hat_stft = \
        np.array([pra.stft(signal, fft_size, frame_shift_step,
                           transform=mkl_fft.rfft).T for signal in y_noisy]) / np.sqrt(fft_size)

    return y_hat_stft, y_hat_stft_noiseless
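# A hedged sketch of what the Ns snapshots are typically for: estimating the
# per-frequency-bin spatial covariance matrix of the array. y_hat_stft is the
# first output of gen_sig_at_mic_stft, with assumed shape
# (num_mic, fft_size // 2 + 1, Ns).
import numpy as np

def estimate_spatial_covariance(y_hat_stft):
    # average the outer products of the snapshot vectors over the frames;
    # returns an array of shape (n_bins, num_mic, num_mic)
    n_frames = y_hat_stft.shape[2]
    return np.einsum('ift,jft->fij', y_hat_stft, y_hat_stft.conj()) / n_frames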
dSNR = pra.dB(room1.dSNR(mics.center[:, 0], source=0), power=True)
print('The direct SNR for good source is ' + str(dSNR))

# remove a bit of signal at the end
n_lim = int(np.ceil(len(input_mic) - t_cut * Fs))
input_clean = signal1[:n_lim]
input_mic = input_mic[:n_lim]
out_DirectMVDR = out_DirectMVDR[:n_lim]
out_RakeMVDR = out_RakeMVDR[:n_lim]
out_DirectPerceptual = out_DirectPerceptual[:n_lim]
out_RakePerceptual = out_RakePerceptual[:n_lim]

# compute time-frequency planes
F0 = pra.stft(input_clean, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F1 = pra.stft(input_mic, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F2 = pra.stft(out_DirectMVDR, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F3 = pra.stft(out_RakeMVDR, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F4 = pra.stft(out_DirectPerceptual, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F5 = pra.stft(out_RakePerceptual, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
female_speakers_train = list(set([s.speaker for s in filter(lambda x: x.sex == 'F', corpus.sentence_corpus['TRAIN'])])) print('Pick a subset of', n_speakers, 'speakers') training_set_speakers = male_speakers_train[:n_speakers] + female_speakers_train[:n_speakers] print(training_set_speakers) # compute all the spectrograms print('Compute all the spectrograms') window = np.sqrt(pra.cosine(stft_win_len)) # use sqrt because of synthesis training_set = dict() testing_set = dict() for speaker in training_set_speakers: training_set_sentences = filter(lambda x: x.speaker == speaker, corpus.sentence_corpus['TRAIN']) # X is (n_sentences, n_channel, n_frame) x = list() X = list() for sentence in training_set_sentences: print(sentence.speaker, sentence.id,) x.append(sentence.samples) X.append(pra.stft(sentence.samples, stft_win_len, stft_win_len // 2, win=window, transform=np.fft.rfft).T) # TRAIN: # Dalia says the magnitude works better... training_set[speaker] = np.concatenate([np.abs(spectrogram)**2 for spectrogram in X[0:9]], axis=1) # TEST: testing_set[speaker] = x[-1] print('Train the dictionary...') W_dictionary = nmf_train(training_set, n_latent_variables, n_iter=n_iter) W_dictionary /= np.sum(W_dictionary, axis=0)[None,:] np.savez('W_dictionary_em.npz', speakers=list(training_set.keys()), W_dictionary=W_dictionary, testing_set=testing_set)
# channels[1] = raw_channels[2] # channels[2] = raw_channels[1] # channels = raw_data.T[::-1] c = 13503.94 # speed of sound in inches/second nfft = 256 # FFT size freq_range = [300, 3500] mic_positions = pra.circular_2D_array(center=(0, 0), M=3, phi0=-math.pi / 6, radius=8.66) print(mic_positions) X = np.array([ pra.stft(channel, nfft, nfft // 2, transform=np.fft.rfft).T for channel in channels ]) doa = pra.doa.algorithms["MUSIC"](mic_positions, fs, nfft, c=c, num_src=1, max_four=4) doa.locate_sources(X, freq_range=freq_range) # IPython.embed() spatial_resp = doa.grid.values phi_plt = doa.grid.azimuth
dSNR = pra.dB(room1.dSNR(mics.center[:, 0], source=0), power=True)
print('The direct SNR for good source is ' + str(dSNR))

# remove a bit of signal at the end
n_lim = int(np.ceil(len(input_mic) - t_cut * Fs))
input_clean = signal1[:n_lim]
input_mic = input_mic[:n_lim]
out_DirectMVDR = out_DirectMVDR[:n_lim]
out_RakeMVDR = out_RakeMVDR[:n_lim]
out_DirectPerceptual = out_DirectPerceptual[:n_lim]
out_RakePerceptual = out_RakePerceptual[:n_lim]

# compute time-frequency planes
F0 = pra.stft(input_clean, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F1 = pra.stft(input_mic, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F2 = pra.stft(out_DirectMVDR, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F3 = pra.stft(out_RakeMVDR, fft_size, fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
def run_at_azim(azim):
    azim_binned = azim / 5
    source_files = glob(
        '/om/user/francl/recorded_binaural_audio_4078_main_kemar_rescaled/*_{}_azim.wav'
        .format(azim))
    df = pd.DataFrame(
        columns=["azim", "predicted", "algorithm", "source_name"])
    for fname in source_files[:7]:
        freq, stim = read(fname)
        source_name = os.path.basename(fname)
        stim = stim.T
        X = np.array([
            pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
            for signal in stim
        ])
        algo_names = ['SRP', 'MUSIC', 'TOPS', 'CSSM', 'WAVES']
        spatial_resp = dict()
        microphone = np.array([[0 - (mic_offset * 0.01) / 2.0, 0],
                               [0 + (mic_offset * 0.01) / 2.0, 0]]).T
        # loop through algos
        for algo_name in algo_names:
            # Construct the new DOA object
            # the max_four parameter is necessary for FRIDA only
            doa = pra.doa.algorithms[algo_name](microphone,
                                                fs,
                                                nfft,
                                                c=c,
                                                num_src=1,
                                                max_four=4,
                                                n_grid=72)
            # this call performs localization on the frames in X
            doa.locate_sources(X, freq_range=freq_range)
            # store the spatial response
            if algo_name == 'FRIDA':
                spatial_resp[algo_name] = np.abs(doa._gen_dirty_img())
            else:
                spatial_resp[algo_name] = doa.grid.values
            # normalize
            min_val = spatial_resp[algo_name].min()
            max_val = spatial_resp[algo_name].max()
            spatial_resp[algo_name] = (spatial_resp[algo_name] - min_val) / (max_val - min_val)
        for k, v in spatial_resp.items():
            rolled_response = np.roll(v, -18)
            predicted = rolled_response.argmax()
            predicted_folded = fold_locations_full_dist_5deg(
                rolled_response).argmax()
            predicted_folded_rolled = add_fold_offset(predicted_folded,
                                                      predicted, azim_binned)
            df = df.append(
                {
                    "predicted_folded": predicted_folded_rolled,
                    "azim": azim_binned,
                    "predicted": predicted,
                    "algorithm": k,
                    "source_name": source_name
                },
                ignore_index=True)
    return df
room.simulate() # sound-to-light sensor # we assume there is no propagation delay between speaker and sensor leds = LightArray2(src_loc, fs=fs_light) leds.record(target_audio + np.random.randn(*target_audio.shape) * sigma_n, fs=fs_sound) leds_sig = leds.signals - leds.signals.min() leds_sig /= leds_sig.max() leds_time = np.arange(leds.signals.shape[0]) / fs_light # perform VAD on the light signal vad = leds.signals > vad_thresh # Now compute the STFT of the microphone input X = np.moveaxis([ pra.stft(a, nfft, nfft//2, np.fft.rfft, win=pra.hann(nfft)) for a in room.mic_array.signals ], 0, -1) X_time = np.arange(1, X.shape[0]+1) * (nfft / 2) / fs_sound # we need to match the VAD to sampling rate of X vad_x = np.zeros(X_time.shape[0], dtype=bool) v = 0 for i,t in enumerate(X_time): if v < leds_time.shape[0] - 1 and abs(t - leds_time[v]) > abs(t - leds_time[v+1]): v += 1 vad_x[i] = vad[v] vad_x_comp = np.logical_not(vad_x) # covariance matrix Rs = np.einsum('i...j,i...k->...jk', X[vad_x,:,:], np.conj(X[vad_x,:,:])) / np.sum(vad_x) Rn = np.einsum('i...j,i...k->...jk', X[vad_x_comp,:,:], np.conj(X[vad_x_comp,:,:])) / np.sum(vad_x_comp)
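# One common next step with these covariance estimates is a per-bin max-SNR
# (generalized eigenvalue) beamformer. This is a hedged sketch of that idea,
# not necessarily what this script does downstream; the small diagonal
# loading term is an assumption added for numerical stability.
from scipy.linalg import eigh

def gev_weights(Rs, Rn, reg=1e-6):
    # Rs, Rn: (n_freq, n_chan, n_chan) Hermitian covariance matrices
    n_freq, n_chan, _ = Rs.shape
    w = np.zeros((n_freq, n_chan), dtype=complex)
    for f in range(n_freq):
        # the principal generalized eigenvector of (Rs, Rn) maximizes
        # the output SNR in bin f
        _, vecs = eigh(Rs[f], Rn[f] + reg * np.eye(n_chan))
        w[f] = vecs[:, -1]
    return w

# beamformed STFT, with X of shape (n_frames, n_freq, n_chan) as above:
# Y_out = np.einsum('fc,tfc->tf', gev_weights(Rs, Rn).conj(), X)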
]) sdr, sir, sar, perm = bss_eval_sources( ref[:, :y.shape[1] - L // 2, 0], y[:, L // 2:ref.shape[1] + L // 2]) SDR.append(sdr) SIR.append(sir) # START BSS ########### # The STFT needs front *and* back padding # shape == (n_chan, n_frames, n_freq) X = np.array([ pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=L // 2, zp_back=L // 2) for ch in mics_signals ]) X = np.moveaxis(X, 0, 2) # Run AuxIVA Y = pra.bss.auxiva(X, n_iter=30, proj_back=True, callback=convergence_callback) # run iSTFT y = np.array([ pra.istft(Y[:, :, ch], L,
def test_ilrma(): # STFT frame length L = 256 # Room 4m by 6m room_dim = [8, 9] # source location source = np.array([1, 4.5]) # create an anechoic room with sources and mics room = pra.ShoeBox(room_dim, fs=16000, max_order=0, sigma2_awgn=1e-8) # get signals signals = [ np.concatenate( [wavfile.read(f)[1].astype(np.float32) for f in source_files]) for source_files in wav_files ] delays = [1., 0.] locations = [[2.5, 3], [2.5, 6]] # add mic and good source to room # Add silent signals to all sources for sig, d, loc in zip(signals, delays, locations): room.add_source(loc, signal=np.zeros_like(sig), delay=d) # add microphone array room.add_microphone_array( pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], fs=room.fs)) # compute RIRs room.compute_rir() # Record each source separately separate_recordings = [] for source, signal in zip(room.sources, signals): source.signal[:] = signal room.simulate() separate_recordings.append(room.mic_array.signals) source.signal[:] = 0. separate_recordings = np.array(separate_recordings) # Mix down the recorded signals mics_signals = np.sum(separate_recordings, axis=0) # START BSS ########### # shape == (n_chan, n_frames, n_freq) X = np.array([ pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=L // 2, zp_back=L // 2) for ch in mics_signals ]) X = np.moveaxis(X, 0, 2) # Run ILRMA Y = pra.bss.ilrma(X, n_iter=30, n_components=30, proj_back=True) # run iSTFT y = np.array([ pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_front=L // 2, zp_back=L // 2) for ch in range(Y.shape[2]) ]) # Compare SIR ############# ref = np.moveaxis(separate_recordings, 1, 2) y_aligned = y[:, L // 2:ref.shape[1] + L // 2] mse = np.mean((ref[:, :, 0] - y_aligned)**2) input_variance = np.var(np.concatenate(signals)) print('Relative MSE (expect less than 1e-5):', mse / input_variance) assert (mse / input_variance) < 1e-5
# propagation filter bank propagation_vector = -np.array([np.cos(azimuth), np.sin(azimuth)]) delays = np.dot(R.T, propagation_vector) / c * fs # in fractional samples filter_bank = pra.fractional_delay_filter_bank(delays) # we use a white noise signal for the source x = np.random.randn((nfft // 2 + 1) * nfft) # convolve the source signal with the fractional delay filters # to get the microphone input signals mic_signals = [fftconvolve(x, filter, mode='same') for filter in filter_bank] X = np.array([ pra.stft(signal, nfft, nfft // 2, win=np.hanning(nfft), transform=np.fft.rfft).T for signal in mic_signals ]) class TestDOA(TestCase): def test_music(self): doa = pra.doa.algorithms['MUSIC'](R, fs, nfft, c=c) doa.locate_sources(X, freq_bins=freq_bins) print('distance:', pra.doa.circ_dist(azimuth, doa.azimuth_recon)) self.assertTrue(pra.doa.circ_dist(azimuth, doa.azimuth_recon) < tol) def test_srp_phat(self): doa = pra.doa.algorithms['SRP'](R, fs, nfft, c=c) doa.locate_sources(X, freq_bins=freq_bins)
def test_sparseauxiva(): signals = [np.concatenate([wavfile.read(f)[1].astype(np.float32, order='C') for f in source_files]) for source_files in wav_files] # Define a room environment, as well as the microphone array and source locations. ########### # Room dimensions in meters room_dim = [8, 9] # source locations and delays locations = [[2.5, 3], [2.5, 6]] delays = [1., 0.] # create a room with sources and mics room = pra.ShoeBox(room_dim, fs=16000, max_order=15, absorption=0.35, sigma2_awgn=1e-8) # add mic and good source to room # Add silent signals to all sources for sig, d, loc in zip(signals, delays, locations): room.add_source(loc, signal=np.zeros_like(sig), delay=d) # add microphone array room.add_microphone_array(pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], room.fs)) # Compute the RIRs as in the Room Impulse Response generation section. # compute RIRs room.compute_rir() # Record each source separately separate_recordings = [] for source, signal in zip(room.sources, signals): source.signal[:] = signal room.simulate() separate_recordings.append(room.mic_array.signals) source.signal[:] = 0. separate_recordings = np.array(separate_recordings) # Mix down the recorded signals ########### mics_signals = np.sum(separate_recordings, axis=0) # STFT frame length L = 2048 # Observation vector in the STFT domain X = np.array([pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=L // 2, zp_back=L // 2) for ch in mics_signals]) X = np.moveaxis(X, 0, 2) # START BSS ########### # Estimate set of active frequency bins ratio = 0.35 average = np.abs(np.mean(np.mean(X, axis=2), axis=0)) k = np.int_(average.shape[0] * ratio) S = np.sort(np.argpartition(average, -k)[-k:]) # Run SparseAuxIva Y = pra.bss.sparseauxiva(X, S) # run iSTFT y = np.array([pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_front=L // 2, zp_back=L // 2) for ch in range(Y.shape[2])]) # Compare SIR ############# ref = np.moveaxis(separate_recordings, 1, 2) y_aligned = y[:,L//2:ref.shape[1]+L//2] mse = np.mean((ref[:,:,0] - y_aligned)**2) input_variance = np.var(np.concatenate(signals)) print('Relative MSE (expect less than 1e-3):', mse / input_variance) assert (mse / input_variance) < 1e-3
def multinmf_conv_mu_wrapper(x, n_src, n_latent_var, stft_win_len,
                             partial_rirs=None, W_dict=None, n_iter=500,
                             l1_reg=0., random_seed=0, verbose=False):
    '''
    A wrapper around multichannel NMF using MU updates to use with
    pyroomacoustics. Performs the STFT and ensures all signals are the
    correct shape.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_channel) array of time domain samples
    n_src: int
        The number of sources
    n_latent_var: int
        The number of latent variables in the NMF
    stft_win_len:
        The length of the STFT window
    partial_rirs: array_like, optional
        (n_channel x n_src x n_bins) array of partial TF. If provided, Q is
        not optimized.
    W_dict: array_like, optional
        A dictionary of atoms that can be used in the NMF. If provided, W is
        not optimized.
    n_iter: int, optional
        The number of iterations of NMF (default 500)
    l1_reg: float, optional
        The weight of the l1 regularization term for the activations
        (default 0., not regularized)
    random_seed: unsigned int, optional
        The seed to provide to the RNG prior to initialization of NMF
        parameters. This allows to use repeatable initialization.
    verbose: bool, optional
        When true, prints convergence info of NMF (default False)
    '''

    n_channel = x.shape[1]

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_channel, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, stft_win_len // 2, win=window,
                 transform=np.fft.rfft) for ch in range(n_channel)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_channel)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    # Squared magnitude and unit energy per bin
    V = np.abs(X)**2
    V /= np.mean(V)

    # Random initialization of multichannel NMF parameters
    np.random.seed(random_seed)
    K = n_latent_var * n_src
    source_NMF_ind = np.reshape(np.arange(n_latent_var * n_src, dtype=int),
                                (n_src, -1))

    mix_psd = np.mean(V, axis=(1, 2))
    # W is initialized so that its energy follows the mixture PSD
    if W_dict is None:
        W_init = 0.5 * ((np.abs(np.random.randn(n_bin, K)) + np.ones(
            (n_bin, K))) * (mix_psd[:, np.newaxis] * np.ones((1, K))))
        fix_W = False
    else:
        if W_dict.shape[1] == n_latent_var:
            W_init = np.tile(W_dict, n_src)
        elif W_dict.shape[1] == n_src * n_latent_var:
            W_init = W_dict
        else:
            raise ValueError(
                'Mismatch between dictionary size and latent variables')
        fix_W = True

    # follow average activations
    mix_act = np.mean(V, axis=(0, 2))
    H_init = 0.5 * (np.abs(np.random.randn(K, n_frame)) + np.ones(
        (K, n_frame))) * mix_act[np.newaxis, :]

    if partial_rirs is not None:
        # squared mag partial rirs (n_bin, n_channel, n_src)
        Q_init = np.moveaxis(np.abs(partial_rirs)**2, [2], [0])
        Q_init /= np.max(Q_init, axis=0)[None, :, :]
        fix_Q = True
    else:
        # random initialization
        Q_shape = (n_bin, n_channel, n_src)
        Q_init = (0.5 * (1.9 * np.abs(np.random.randn(*Q_shape)) +
                         0.1 * np.ones(Q_shape)))**2
        fix_Q = False

    # RUN NMF
    W_MU, H_MU, Q_MU, cost = \
        multinmf_conv_mu(
            np.abs(X)**2, W_init, H_init, Q_init, source_NMF_ind,
            n_iter=n_iter, fix_Q=fix_Q, fix_W=fix_W, H_l1_reg=l1_reg,
            verbose=verbose)

    # Computation of the spatial source images
    Im = multinmf_recons_im(X, W_MU, H_MU, Q_MU, source_NMF_ind)

    sep_sources = []
    # Inverse STFT
    for j in range(n_src):
        # channel-wise istft with synthesis window
        ie_MU = []
        for ch in range(n_channel):
            ie_MU.append(
                pra.istft(Im[:, :, j, ch].T, stft_win_len, stft_win_len // 2,
                          win=window, transform=np.fft.irfft))
        sep_sources.append(np.array(ie_MU).T)

    return np.array(sep_sources)
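# Hypothetical call: a two-channel mixture x_demo, separating two sources with
# ten latent variables each. White noise stands in for a real mixture, and
# multinmf_conv_mu / multinmf_recons_im are assumed importable from this module.
if __name__ == '__main__':
    import numpy as np

    x_demo = np.random.randn(2 * 16000, 2)  # 2 s of stereo noise at 16 kHz
    sep = multinmf_conv_mu_wrapper(x_demo, n_src=2, n_latent_var=10,
                                   stft_win_len=2048, n_iter=50)
    # sep has shape (n_src, n_samples_out, n_channel)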
def multinmf_conv_em_wrapper(
        x, n_src, stft_win_len, n_latent_var, n_iter=500,
        A_init=None, W_init=None, H_init=None,
        update_a=True, update_w=True, update_h=True,
        verbose=False):
    '''
    A wrapper around multichannel NMF using EM updates to use with
    pyroomacoustics. Performs the STFT and ensures all signals are the
    correct shape.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_chan) array of time domain samples
    n_latent_var: int
        number of latent variables in the NMF
    '''

    n_chan = x.shape[1]

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_chan, n_frame, n_bin)
    X = np.array(
        [pra.stft(x[:, ch], stft_win_len, stft_win_len // 2, win=window,
                  transform=np.fft.rfft) for ch in range(n_chan)])
    # move axes to match Ozerov's order (n_bin, n_frame, n_chan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    if W_init is None:
        K = n_latent_var * n_src
    else:
        K = W_init.shape[-1]

    # Initialization of the multichannel NMF parameters
    source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src, -1))

    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))

    if A_init is None:
        # random initialization
        update_a = True
        A_init = (0.5 * (1.9 * np.abs(np.random.randn(n_bin, n_chan, n_src))
                         + 0.1 * np.ones((n_bin, n_chan, n_src)))
                  * np.sign(np.random.randn(n_bin, n_chan, n_src)
                            + 1j * np.random.randn(n_bin, n_chan, n_src)))
    else:
        # reshape the partial rir input (n_bin, n_chan, n_src)
        A_init = np.moveaxis(A_init, [2], [0])

    # W is initialized so that its energy follows the mixture PSD
    if W_init is None:
        W_init = 0.5 * ((np.abs(np.random.randn(n_bin, K)) + np.ones((n_bin, K)))
                        * (mix_psd[:, np.newaxis] * np.ones((1, K))))

    if H_init is None:
        H_init = 0.5 * (np.abs(np.random.randn(K, n_frame)) + np.ones((K, n_frame)))

    Sigma_b_init = mix_psd / 100

    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init,
                         source_NMF_ind, iter_num=n_iter,
                         update_a=update_a, update_w=update_w,
                         update_h=update_h, verbose=verbose)

    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    if verbose:
        print('Computation of the spatial source images\n')
    Ie_EM = np.zeros((n_bin, n_frame, n_src, n_chan), dtype=complex)
    for j in range(n_src):
        for f in range(n_bin):
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    sep_sources = []
    # Inverse STFT
    for j in range(n_src):
        # channel-wise istft with synthesis window
        ie_EM = []
        for ch in range(n_chan):
            ie_EM.append(
                pra.istft(Ie_EM[:, :, j, ch].T, stft_win_len,
                          stft_win_len // 2, win=window,
                          transform=np.fft.irfft))
        sep_sources.append(np.array(ie_EM).T)

    return np.array(sep_sources)
c = 343
fs = 16000
nfft = 512

# Possible DOA algorithms: SRP, MUSIC, TOPS, CSSM, WAVES
doa = pra.doa.algorithms['SRP'](R, fs, nfft, c=c)

plt.figure()

with MicArray(fs, 4, fs / 4) as mic:
    start = time.time()
    for chunk in mic.read_chunks():
        #print(chunk.shape)
        #pixels.wakeup(np.random.randint(0, 360, 1))
        X = np.array([
            pra.stft(chunk[i::4], nfft, nfft // 2,
                     transform=np.fft.rfft).T for i in range(4)
        ])
        doa.locate_sources(X, freq_range=[500, 3000])
        direction = doa.azimuth_recon / np.pi * 180
        print('Time: ', time.time() - start, ' Recovered azimuth: ', direction)
        pixels.wakeup(direction)
        #plt.close()
        #doa.polar_plt_dirac()
        #plt.draw()
        #plt.pause(0.0001)

        if is_quit.is_set():
            break
def _preprocessing(audio): X = np.array([pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T for signal in audio]) return X
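# Equivalent preprocessing with the newer (non-deprecated) transform module, a
# hedged sketch: pra.transform.stft.analysis returns (n_frames, n_freq), so
# the transpose reproduces the per-channel shape built above. nfft is assumed
# to be the same module-level constant used by _preprocessing.
def _preprocessing_new(audio):
    X = np.array([
        pra.transform.stft.analysis(signal, nfft, nfft // 2).T
        for signal in audio
    ])
    return X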