def rir(self, fname, fs=16000, rir_nsamps=4096, v=340, gpu=False):
    """
    Simulate the RIR for the current room settings and store it in fname.

    Args:
        fname: destination of the generated RIR
        fs: sampling frequency
        rir_nsamps: number of samples of the RIR
        v: sound velocity (not used by the gpu backend)
        gpu: if True, simulate with pygpurir; otherwise use rir-simulate
            when it is available and pyrirgen as the last resort
    Raises:
        RuntimeError: gpu is False and neither fallback backend is available
    """
    if gpu:
        # self.beta: rt60
        reflect_coef = pygpurir.beta_SabineEstimation(self.size, self.beta)
        # NOTE: do not clear here
        # diff = pygpurir.att2t_SabineEstimator(15, self.beta)
        duration = rir_nsamps / fs
        num_images = pygpurir.t2n(duration, self.size)
        # a leading axis is added so the single source becomes a 1 x 3 batch
        src_pos = np.array(self.spos)[None, ...]
        rcv_pos = np.array(self.rpos)
        # S x R x T
        rir = pygpurir.simulateRIR(self.size,
                                   reflect_coef,
                                   src_pos,
                                   rcv_pos,
                                   num_images,
                                   duration,
                                   fs,
                                   mic_pattern="omni")
        # only one source was simulated, keep its R x T slice
        write_wav(fname, rir[0], fs=fs)
        return

    if cpp_rir_available:

        def fmt3(value):
            # fixed three-decimal rendering for command line values
            return "{:.3f}".format(value)

        def join_floats(values):
            return ",".join(map(fmt3, values))

        # one "x,y,z" entry per microphone
        receiver_entries = [join_floats(pos) for pos in self.rpos]
        # a list of values is passed comma separated, a scalar is rounded
        if isinstance(self.beta, list):
            beta_arg = join_floats(self.beta)
        else:
            beta_arg = round(self.beta, 3)
        command = (
            "rir-simulate --sound-velocity={v} --samp-frequency={sample_rate} "
            "--hp-filter=true --number-samples={rir_samples} --beta={beta} "
            "--room-topo={room_size} --receiver-location=\"{receiver_location}\" "
            "--source-location={source_location} {dump_dest}").format(
                v=v,
                sample_rate=fs,
                rir_samples=rir_nsamps,
                room_size=join_floats(self.size),
                beta=beta_arg,
                receiver_location=";".join(receiver_entries),
                source_location=join_floats(self.spos),
                dump_dest=fname)
        run_command(command)
        return

    if pyrirgen_available:
        rir = pyrirgen.generateRir(self.size,
                                   self.spos,
                                   self.rpos,
                                   soundVelocity=v,
                                   fs=fs,
                                   nDim=3,
                                   nSamples=rir_nsamps,
                                   nOrder=-1,
                                   reverbTime=self.beta,
                                   micType="omnidirectional",
                                   isHighPassFilter=True)
        # a list result is stacked into one array before writing
        if isinstance(rir, list):
            rir = np.stack(rir)
        write_wav(fname, rir, fs=fs)
        return

    raise RuntimeError("Both rir-simulate and pyrirgen unavailable")
[0, 0.3, 0.3 * 2, 0.3 * 3, 0.3 * 4, 0.3 * 5]] # Generate RIRs RIRs = np.zeros((0, 5)) for r in range(0, len(rooms)): room_sz = rooms[r] arr_centre = np.random.uniform(MIN_ARR2WALL, room_sz - MAX_ARR2WALL, size=[1, 3]) arr = arr_centre[-1] * np.ones((3, 5)) arr[0, 0] = arr_centre[0] - 0.08 arr[0, 1] = arr_centre[0] - 0.04 arr[0, 2] = arr_centre[0] arr[0, 3] = arr_centre[0] + 0.04 arr[0, 4] = arr_centre[0] + 0.08 arr[1, :] = arr_centre[1] pos_rcv = arr.T for rt in range(0, 6): T60 = rev_times[r] dist = distances[r] for ang in tqdm(range(0, 37)): x = np.cos(angles[ang]) * dist + arr_centre[0, 0] y = np.sin(angles[ang]) * dist + arr_centre[0, 1] pos_src = np.array([[x, y, 1.5]]) beta = gpuRIR.beta_SabineEstimation( room_sz, T60[rt]) # Reflection coefficients nb_img = gpuRIR.t2n( Tdiff, room_sz) # Number of image sources in each dimension _rirs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs).reshape(5, 9600).T RIRs = np.concatenate((RIRs, _rirs))
fs = 16000.0  # Sampling frequency [Hz]
pos_src = np.random.rand(nb_src, 3) * room_sz
pos_rcv = np.random.rand(nb_rcv, 3) * room_sz

time_max = 100  # Stop measuring once the average time per run exceeds this [s]
# One averaged runtime per T60 value; entries after an early stop remain 0.
times = np.zeros((len(T60_vec), 1))
for i, T60 in enumerate(T60_vec):
    # perf_counter is monotonic and high resolution, so the measurement is not
    # disturbed by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    for j in range(nb_test_per_point):
        # The parameter estimation is inside the timed region, as before.
        beta = gpuRIR.beta_SabineEstimation(room_sz, T60)
        Tdiff = gpuRIR.att2t_SabineEstimator(att_diff, T60)
        Tmax = gpuRIR.att2t_SabineEstimator(att_max, T60)
        nb_img = gpuRIR.t2n(Tdiff, room_sz)
        RIRs = gpuRIR.simulateRIR(room_sz,
                                  beta,
                                  pos_src,
                                  pos_rcv,
                                  nb_img,
                                  Tmax,
                                  fs,
                                  Tdiff=Tdiff)
    times[i] = (time.perf_counter() - start_time) / nb_test_per_point
    if times[i] > time_max:
        break
print(times.transpose())
def simulate(self):
    """ Simulate the array recording with gpuRIR.

    The RIRs along self.traj_pts are computed in one or more gpuRIR calls,
    the source signal is filtered through them and Gaussian noise is added
    according to self.SNR. The noise level is derived from the power of a
    second simulation that uses a single image per dimension.

    If the instance has a `source_vad` attribute, `self.vad` is set to the
    VAD information filtered through that second set of RIRs.

    Returns the noisy microphone signals, cut to len(self.t) rows.
    """
    if self.T60 == 0:
        ism_time = 0.1
        rir_time = 0.1
        image_counts = [1, 1, 1]
    else:
        # Use ISM until the RIRs decay 12dB
        ism_time = gpuRIR.att2t_SabineEstimator(12, self.T60)
        # Use diffuse model until the RIRs decay 40dB
        rir_time = gpuRIR.att2t_SabineEstimator(40, self.T60)
        if self.T60 < 0.15:
            ism_time = rir_time  # Avoid issues with too short RIRs
        image_counts = gpuRIR.t2n(ism_time, self.room_sz)

    # Receiver settings shared by every simulateRIR call below
    mic_kwargs = dict(orV_rcv=self.array_setup.mic_orV,
                      mic_pattern=self.array_setup.mic_pattern)

    # Split the trajectory so that the estimated size of a single call
    # stays around 1e9, using at most one call per trajectory point
    num_mics = len(self.mic_pos)
    num_pts = len(self.traj_pts)
    workload = (self.fs * ism_time * num_mics * num_pts *
                np.prod(image_counts) / 1e9)
    num_calls = min(int(np.ceil(workload)), num_pts)
    batch_edges = np.ceil(num_pts / num_calls *
                          np.arange(0, num_calls + 1)).astype(int)

    rir_batches = [
        gpuRIR.simulateRIR(self.room_sz,
                           self.beta,
                           self.traj_pts[first:last, :],
                           self.mic_pos,
                           image_counts,
                           rir_time,
                           self.fs,
                           Tdiff=ism_time,
                           **mic_kwargs)
        for first, last in zip(batch_edges[:-1], batch_edges[1:])
    ]
    RIRs = np.concatenate(rir_batches, axis=0)

    mic_signals = gpuRIR.simulateTrajectory(self.source_signal,
                                            RIRs,
                                            timestamps=self.timestamps,
                                            fs=self.fs)
    num_samples = len(self.t)
    mic_signals = mic_signals[0:num_samples, :]

    # Omnidirectional noise
    dp_RIRs = gpuRIR.simulateRIR(self.room_sz, self.beta, self.traj_pts,
                                 self.mic_pos, [1, 1, 1], 0.1, self.fs,
                                 **mic_kwargs)
    dp_signals = gpuRIR.simulateTrajectory(self.source_signal,
                                           dp_RIRs,
                                           timestamps=self.timestamps,
                                           fs=self.fs)
    # Mean power over the channels (columns) of the reference signals
    channel_powers = [acoustic_power(channel) for channel in dp_signals.T]
    ac_pow = np.mean(channel_powers)
    noise_gain = np.sqrt(ac_pow / 10**(self.SNR / 10))
    mic_signals += noise_gain * np.random.standard_normal(mic_signals.shape)

    # Apply the propagation delay to the VAD information if it exists
    if hasattr(self, 'source_vad'):
        vad = gpuRIR.simulateTrajectory(self.source_vad,
                                        dp_RIRs,
                                        timestamps=self.timestamps,
                                        fs=self.fs)
        vad = vad[0:num_samples, :]
        # Active where the channel average exceeds 1e-3 of the peak value
        self.vad = vad.mean(axis=1) > vad.max() * 1e-3

    return mic_signals
def generate_data(output_path='',
                  dataset='adhoc',
                  libri_path='/hdd/data/Librispeech/LibriSpeech',
                  noise_path='/hdd/data/Nonspeech'):
    """Generate the multi-channel two-speaker noisy mixtures.

    For each of the 'train', 'validation' and 'test' splits the configuration
    list is read from configs/MC_Libri_<dataset>_<split>.pkl. For every
    utterance and microphone three files are written to
    <output_path>/MC_Libri_<dataset>/<split>/<nmic>mic/sample<k>/:
    spk1_mic<m>.wav, spk2_mic<m>.wav and mixture_mic<m>.wav.

    Args:
        output_path: root output directory; '' means the current directory.
        dataset: 'adhoc' (microphone count from the config) or 'fixed'
            (6 microphones).
        libri_path: root directory of the speech files named in the configs.
        noise_path: root directory of the noise files named in the configs.

    Raises:
        AssertionError: if dataset is neither 'adhoc' nor 'fixed'.
    """
    assert dataset in ['adhoc',
                       'fixed'], "dataset can only be adhoc or fixed."

    if output_path == '':
        output_path = os.getcwd()

    # sample rate is 16k Hz
    sr = 16000
    # signal length is 4 sec
    sig_len = 4

    for split in ['train', 'validation', 'test']:
        # path for config
        config_path = os.path.join(
            'configs', 'MC_Libri_' + dataset + '_' + split + '.pkl')

        # load pickle file
        # NOTE: pickle.load can execute arbitrary code; only use config files
        # from a trusted source.
        with open(config_path, 'rb') as f:
            configs = pickle.load(f)

        # generate and save audio
        save_dir = os.path.join(output_path, 'MC_Libri_' + dataset, split)
        os.makedirs(save_dir, exist_ok=True)

        # Report progress about five times per split; max() avoids a modulo
        # by zero when a split holds fewer than five utterances.
        report_every = max(1, len(configs) // 5)

        for utt in range(len(configs)):
            this_config = configs[utt]

            # load audio files
            speakers = this_config['speech']
            spk1, _ = sf.read(os.path.join(libri_path, speakers[0]))
            spk2, _ = sf.read(os.path.join(libri_path, speakers[1]))
            noise, _ = sf.read(
                os.path.join(noise_path, this_config['noise']))

            # calculate signal length according to overlap ratio
            # (convert to samples BEFORE truncating to an integer, otherwise
            # the length is rounded down to whole seconds)
            overlap_ratio = this_config['overlap_ratio']
            actual_len = int(sig_len / (2 - overlap_ratio) * sr)
            overlap = int(actual_len * overlap_ratio)

            # truncate speech according to start and end indexes
            start_idx = this_config['start_idx']
            end_idx = this_config['end_idx']
            spk1 = spk1[start_idx:end_idx]
            spk2 = spk2[start_idx:end_idx]

            # rescaling spk2 energy according to relative SNR
            spk1 = spk1 / np.sqrt(np.sum(spk1**2) + 1e-8) * 1e2
            spk2 = spk2 / np.sqrt(np.sum(spk2**2) + 1e-8) * 1e2
            spk2 = spk2 * np.power(10, this_config['spk_snr'] / 20.)

            # load locations and room configs
            mic_pos = np.asarray(this_config['mic_pos'])
            spk_pos = np.asarray(this_config['spk_pos'])
            noise_pos = np.asarray(this_config['noise_pos'])
            room_size = np.asarray(this_config['room_size'])
            rt60 = this_config['RT60']

            # generate RIR
            beta = gpuRIR.beta_SabineEstimation(room_size, rt60)
            nb_img = gpuRIR.t2n(rt60, room_size)
            spk_rir = gpuRIR.simulateRIR(room_size, beta, spk_pos, mic_pos,
                                         nb_img, rt60, sr)
            noise_rir = gpuRIR.simulateRIR(room_size, beta, noise_pos,
                                           mic_pos, nb_img, rt60, sr)

            # convolve with RIR at different mic
            if dataset == 'adhoc':
                nmic = this_config['num_mic']
            else:
                nmic = 6

            # The output directory does not depend on the microphone index,
            # so it is created once per utterance.
            this_save_dir = os.path.join(save_dir,
                                         str(nmic) + 'mic',
                                         'sample' + str(utt + 1))
            os.makedirs(this_save_dir, exist_ok=True)

            for mic in range(nmic):
                spk1_echoic_sig = signal.fftconvolve(spk1, spk_rir[0][mic])
                spk2_echoic_sig = signal.fftconvolve(spk2, spk_rir[1][mic])

                # align the speakers according to overlap ratio
                actual_length = len(spk1_echoic_sig)
                total_length = actual_length * 2 - overlap
                padding = np.zeros(actual_length - overlap)
                spk1_echoic_sig = np.concatenate([spk1_echoic_sig, padding])
                spk2_echoic_sig = np.concatenate([padding, spk2_echoic_sig])
                mixture = spk1_echoic_sig + spk2_echoic_sig

                # add noise
                # Work on a per-microphone copy: the dry `noise` must stay
                # untouched, otherwise every later microphone would convolve
                # the already reverberated and rescaled noise again.
                mic_noise = noise[:total_length]
                if len(mic_noise) < total_length:
                    # repeat noise if necessary
                    num_repeat = total_length // len(mic_noise)
                    res = total_length - num_repeat * len(mic_noise)
                    mic_noise = np.concatenate([
                        np.concatenate([mic_noise] * num_repeat),
                        mic_noise[:res]
                    ])
                mic_noise = signal.fftconvolve(mic_noise, noise_rir[0][mic])

                # rescaling noise energy
                mic_noise = mic_noise[:total_length]
                mic_noise = mic_noise / np.sqrt(
                    np.sum(mic_noise**2) + 1e-8) * np.sqrt(
                        np.sum(mixture**2) + 1e-8)
                mic_noise = mic_noise / np.power(
                    10, this_config['noise_snr'] / 20.)

                mixture += mic_noise

                # save waveforms
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk1_mic' + str(mic + 1) + '.wav'),
                    spk1_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk2_mic' + str(mic + 1) + '.wav'),
                    spk2_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'mixture_mic' + str(mic + 1) + '.wav'),
                    mixture, sr)

            # print progress
            if (utt + 1) % report_every == 0:
                print(
                    "{} configuration, {} set, {:d} out of {:d} utterances generated."
                    .format(dataset, split, utt + 1, len(configs)))
def generate_data(output_path='',
                  avoid_clipping=0,
                  dataset='adhoc',
                  libri_path='/home/yi/data/Librispeech',
                  noise_path='/home/yi/data/Nonspeech'):
    """Generate the multi-channel two-speaker noisy mixtures (4 s each).

    For each of the 'train', 'validation' and 'test' splits the configuration
    list is read from configs/MC_Libri_<dataset>_<split>.pkl. For every
    utterance and microphone three files are written to
    <output_path>/MC_Libri_<dataset>/<split>/<N>mic/sample<k>/:
    spk1_mic<m>.wav, spk2_mic<m>.wav and mixture_mic<m>.wav, where N is the
    number of microphone positions in the config.

    Args:
        output_path: root output directory; '' means the current directory.
        avoid_clipping: if truthy, the mixture and both speaker signals are
            scaled by a common factor so that their largest peak is 0.9.
        dataset: 'adhoc' (microphone count from the config) or 'fixed'
            (6 microphones).
        libri_path: root directory of the speech files named in the configs.
        noise_path: root directory of the noise files named in the configs.

    Raises:
        AssertionError: if dataset is neither 'adhoc' nor 'fixed'.
    """
    assert dataset in ['adhoc',
                       'fixed'], "dataset can only be adhoc or fixed."

    if output_path == '':
        output_path = os.getcwd()

    # sample rate is 16k Hz
    sr = 16000
    # signal length is 4 sec
    sig_len = 4
    target_len = int(sig_len * sr)

    def pad_sig(x):
        # pad or truncate length to 4s if necessary
        if len(x) < target_len:
            return np.concatenate([x, np.zeros(target_len - len(x))])
        return x[:target_len]

    for split in ['train', 'validation', 'test']:
        # path for config
        config_path = os.path.join(
            'configs', 'MC_Libri_' + dataset + '_' + split + '.pkl')

        # load pickle file
        # NOTE: pickle.load can execute arbitrary code; only use config files
        # from a trusted source.
        with open(config_path, 'rb') as f:
            configs = pickle.load(f)

        # Report progress about five times per split; max() avoids a modulo
        # by zero when a split holds fewer than five utterances.
        report_every = max(1, len(configs) // 5)

        for utt in range(len(configs)):
            this_config = configs[utt]

            # load audio files
            speakers = this_config['speech']
            spk1, _ = sf.read(os.path.join(libri_path, speakers[0]))
            spk2, _ = sf.read(os.path.join(libri_path, speakers[1]))
            noise, _ = sf.read(
                os.path.join(noise_path, this_config['noise']))

            # calculate signal length according to overlap ratio
            overlap_ratio = this_config['overlap_ratio']
            actual_len = int(sig_len / (2 - overlap_ratio) * sr)

            # truncate speech according to start and end indexes
            start_idx = this_config['start_idx']
            end_idx = start_idx + actual_len
            spk1 = spk1[start_idx:end_idx]
            spk2 = spk2[start_idx:end_idx]

            # rescaling speaker and noise energy according to relative SNR
            spk1 = spk1 / np.sqrt(np.sum(spk1**2) + 1e-8) * 1e2
            spk2 = spk2 / np.sqrt(np.sum(spk2**2) + 1e-8) * 1e2
            spk2 = spk2 * np.power(10, this_config['spk_snr'] / 20.)

            # repeat noise if necessary
            noise = noise[:target_len]
            if len(noise) < target_len:
                num_repeat = target_len // len(noise)
                res = target_len - num_repeat * len(noise)
                noise = np.concatenate(
                    [np.concatenate([noise] * num_repeat), noise[:res]])

            # rescale noise energy w.r.t mixture energy
            noise = noise / np.sqrt(np.sum(noise**2) + 1e-8) * np.sqrt(
                np.sum((spk1 + spk2)**2) + 1e-8)
            noise = noise / np.power(10, this_config['noise_snr'] / 20.)

            # load locations and room configs
            mic_pos = np.asarray(this_config['mic_pos'])
            spk_pos = np.asarray(this_config['spk_pos'])
            noise_pos = np.asarray(this_config['noise_pos'])
            room_size = np.asarray(this_config['room_size'])
            rt60 = this_config['RT60']
            num_mic = len(mic_pos)

            # generate RIR
            beta = gpuRIR.beta_SabineEstimation(room_size, rt60)
            nb_img = gpuRIR.t2n(rt60, room_size)
            spk_rir = gpuRIR.simulateRIR(room_size, beta, spk_pos, mic_pos,
                                         nb_img, rt60, sr)
            noise_rir = gpuRIR.simulateRIR(room_size, beta, noise_pos,
                                           mic_pos, nb_img, rt60, sr)

            # convolve with RIR at different mic
            if dataset == 'adhoc':
                nmic = this_config['num_mic']
            else:
                nmic = 6

            # The output directory does not depend on the microphone index,
            # so it is created once per utterance.
            # NOTE(review): the directory is named after len(mic_pos) while
            # the loop below runs over nmic - confirm the two always agree.
            this_save_dir = os.path.join(output_path, 'MC_Libri_' + dataset,
                                         split,
                                         str(num_mic) + 'mic',
                                         'sample' + str(utt + 1))
            os.makedirs(this_save_dir, exist_ok=True)

            # the second speaker starts after the non-overlapping part of
            # the first one
            pad_length = int((1 - overlap_ratio) * actual_len)
            padding = np.zeros(pad_length)

            for mic in range(nmic):
                spk1_echoic_sig = signal.fftconvolve(spk1, spk_rir[0][mic])
                spk2_echoic_sig = signal.fftconvolve(spk2, spk_rir[1][mic])
                noise_echoic_sig = signal.fftconvolve(noise,
                                                      noise_rir[0][mic])

                # align the speakers according to overlap ratio
                spk1_echoic_sig = np.concatenate([spk1_echoic_sig, padding])
                spk2_echoic_sig = np.concatenate([padding, spk2_echoic_sig])

                # pad or truncate length to 4s if necessary
                spk1_echoic_sig = pad_sig(spk1_echoic_sig)
                spk2_echoic_sig = pad_sig(spk2_echoic_sig)
                noise_echoic_sig = pad_sig(noise_echoic_sig)

                # sum up for mixture
                mixture = (spk1_echoic_sig + spk2_echoic_sig +
                           noise_echoic_sig)

                if avoid_clipping:
                    # avoid clipping
                    max_scale = np.max([
                        np.max(np.abs(mixture)),
                        np.max(np.abs(spk1_echoic_sig)),
                        np.max(np.abs(spk2_echoic_sig))
                    ])
                    # all-zero signals are left as they are: dividing by a
                    # zero peak would fill the output with NaN
                    if max_scale > 0:
                        mixture = mixture / max_scale * 0.9
                        spk1_echoic_sig = spk1_echoic_sig / max_scale * 0.9
                        spk2_echoic_sig = spk2_echoic_sig / max_scale * 0.9

                # save waveforms
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk1_mic' + str(mic + 1) + '.wav'),
                    spk1_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk2_mic' + str(mic + 1) + '.wav'),
                    spk2_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'mixture_mic' + str(mic + 1) + '.wav'),
                    mixture, sr)

            # print progress
            if (utt + 1) % report_every == 0:
                print(
                    "{} configuration, {} set, {:d} out of {:d} utterances generated."
                    .format(dataset, split, utt + 1, len(configs)))