def rir(self, fname, fs=16000, rir_nsamps=4096, v=340, gpu=False):
    """
    Generate a room impulse response (RIR) for the current room settings
    and write it to ``fname`` as a wav file.

    Three backends are tried in order of preference:
      1. ``pygpurir`` (GPU) when ``gpu=True``,
      2. the external ``rir-simulate`` binary when ``cpp_rir_available``,
      3. the ``pyrirgen`` Python package when ``pyrirgen_available``.

    Args:
        fname: destination path for the generated RIR wav file.
        fs: sampling frequency in Hz.
        rir_nsamps: number of samples of the generated RIR.
        v: sound velocity in m/s (only forwarded to the cpp/pyrirgen
           backends; the gpu backend uses its library default).
        gpu: force the GPU (pygpurir) backend.

    Raises:
        RuntimeError: when neither rir-simulate nor pyrirgen is available
            (and gpu is False).

    NOTE(review): relies on instance attributes self.size (room dimensions),
    self.beta (RT60, scalar or per-wall list), self.spos (source position)
    and self.rpos (receiver positions) — set elsewhere; confirm their shapes
    against the constructor.
    """
    if gpu:
        # self.beta: rt60
        beta = pygpurir.beta_SabineEstimation(self.size, self.beta)
        # NOTE: do not clear here
        # diff = pygpurir.att2t_SabineEstimator(15, self.beta)
        # RIR length in seconds; also used as the image-source cutoff time.
        tmax = rir_nsamps / fs
        nb_img = pygpurir.t2n(tmax, self.size)
        # S x R x T (sources x receivers x time); single source here,
        # hence rir[0] below.
        rir = pygpurir.simulateRIR(self.size,
                                   beta,
                                   np.array(self.spos)[None, ...],
                                   np.array(self.rpos),
                                   nb_img,
                                   tmax,
                                   fs,
                                   mic_pattern="omni")
        write_wav(fname, rir[0], fs=fs)
    elif cpp_rir_available:
        # format float with 3 decimals for the command line
        ffloat = lambda f: "{:.3f}".format(f)
        # location for each microphone, "x,y,z" per channel
        loc_for_each_channel = [
            ",".join(map(ffloat, p)) for p in self.rpos
        ]
        # --beta accepts either a per-wall coefficient list or a single
        # RT60 value; keep whichever form self.beta carries.
        beta = ",".join(map(ffloat, self.beta)) if isinstance(
            self.beta, list) else round(self.beta, 3)
        # NOTE(review): shells out to the external `rir-simulate` tool;
        # receiver locations are ';'-separated and quoted.
        run_command(
            "rir-simulate --sound-velocity={v} --samp-frequency={sample_rate} "
            "--hp-filter=true --number-samples={rir_samples} --beta={beta} "
            "--room-topo={room_size} --receiver-location=\"{receiver_location}\" "
            "--source-location={source_location} {dump_dest}".format(
                v=v,
                sample_rate=fs,
                rir_samples=rir_nsamps,
                room_size=",".join(map(ffloat, self.size)),
                beta=beta,
                receiver_location=";".join(loc_for_each_channel),
                source_location=",".join(map(ffloat, self.spos)),
                dump_dest=fname))
    elif pyrirgen_available:
        # nOrder=-1: unlimited reflection order (library convention).
        rir = pyrirgen.generateRir(self.size,
                                   self.spos,
                                   self.rpos,
                                   soundVelocity=v,
                                   fs=fs,
                                   nDim=3,
                                   nSamples=rir_nsamps,
                                   nOrder=-1,
                                   reverbTime=self.beta,
                                   micType="omnidirectional",
                                   isHighPassFilter=True)
        # pyrirgen may return a list of per-channel arrays; stack to 2-D.
        if isinstance(rir, list):
            rir = np.stack(rir)
        write_wav(fname, rir, fs=fs)
    else:
        raise RuntimeError("Both rir-simulate and pyrirgen unavailable")
        # Tail of a list literal whose opening lines are outside this chunk
        # (presumably the per-room reverberation-time table `rev_times`):
        # 0, 0.3, 0.6, 0.9, 1.2, 1.5 seconds.
        [0, 0.3, 0.3 * 2, 0.3 * 3, 0.3 * 4, 0.3 * 5]]

# Generate RIRs
# Accumulator: RIR sample rows x 5 microphone channels.
RIRs = np.zeros((0, 5))
for r in range(0, len(rooms)):
    room_sz = rooms[r]
    # Random array centre, kept at least MIN_ARR2WALL / at most
    # room - MAX_ARR2WALL from the walls; shape (1, 3).
    arr_centre = np.random.uniform(MIN_ARR2WALL,
                                   room_sz - MAX_ARR2WALL,
                                   size=[1, 3])
    # Build a 5-mic linear array (3 coords x 5 mics), 4 cm spacing along x.
    # NOTE(review): arr_centre has shape (1, 3), so arr_centre[-1] and
    # arr_centre[0] are both (3,) rows; multiplying by ones((3, 5)) and
    # assigning into scalar slots arr[0, k] looks shape-inconsistent —
    # verify against the original (unchunked) source; the later use of
    # arr_centre[0, 0] / arr_centre[0, 1] suggests scalar components were
    # intended here.
    arr = arr_centre[-1] * np.ones((3, 5))
    arr[0, 0] = arr_centre[0] - 0.08
    arr[0, 1] = arr_centre[0] - 0.04
    arr[0, 2] = arr_centre[0]
    arr[0, 3] = arr_centre[0] + 0.04
    arr[0, 4] = arr_centre[0] + 0.08
    arr[1, :] = arr_centre[1]
    # gpuRIR expects receivers as (n_rcv, 3).
    pos_rcv = arr.T
    for rt in range(0, 6):
        # Per-room tables of reverberation times and source distances.
        T60 = rev_times[r]
        dist = distances[r]
        # Sweep the source over 37 angles around the array centre.
        for ang in tqdm(range(0, 37)):
            x = np.cos(angles[ang]) * dist + arr_centre[0, 0]
            y = np.sin(angles[ang]) * dist + arr_centre[0, 1]
            # Source at fixed 1.5 m height.
            pos_src = np.array([[x, y, 1.5]])
            beta = gpuRIR.beta_SabineEstimation(
                room_sz, T60[rt])  # Reflection coefficients
            nb_img = gpuRIR.t2n(
                Tdiff, room_sz)  # Number of image sources in each dimension
            # NOTE(review): Tdiff, Tmax and fs are defined outside this
            # chunk. reshape(5, 9600) pins the RIR length to 9600 samples
            # per channel; transpose gives (9600, 5) rows to append.
            _rirs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv,
                                       nb_img, Tmax, fs).reshape(5, 9600).T
            RIRs = np.concatenate((RIRs, _rirs))
room_sz = [3, 4, 2.5] # Room size [m] att_diff = 13.0 # Attenuation when start using the diffuse reverberation model [dB] att_max = 50.0 # Attenuation at the end of the simulation [dB] fs = 16000.0 # Sampling frequency [Hz] pos_src = np.random.rand(nb_src, 3) * room_sz pos_rcv = np.random.rand(nb_rcv, 3) * room_sz time_max = 100 # Stop the measurements after find an average time greter than this time [s] times = np.zeros((len(T60_vec), 1)) for i in range(len(T60_vec)): T60 = T60_vec[i] start_time = time.time() for j in range(nb_test_per_point): beta = gpuRIR.beta_SabineEstimation(room_sz, T60) Tdiff = gpuRIR.att2t_SabineEstimator(att_diff, T60) Tmax = gpuRIR.att2t_SabineEstimator(att_max, T60) nb_img = gpuRIR.t2n(Tdiff, room_sz) RIRs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs, Tdiff=Tdiff) times[i] = (time.time() - start_time) / nb_test_per_point if times[i] > time_max:
def getRandomScene(self, idx):
    """
    Build a random AcousticScene for dataset item ``idx``: sample a room,
    a microphone array placement and a (possibly oscillating) source
    trajectory, then package everything for later RIR simulation.

    Args:
        idx: index into self.sourceDataset providing (source_signal, vad).

    Returns:
        AcousticScene with room, array, trajectory and DOA information;
        the source VAD is attached as ``acoustic_scene.source_vad``.
    """
    # Source signal
    source_signal, vad = self.sourceDataset[idx]

    # Room: sizes, reverberation time and per-wall absorption ratios come
    # from parameter objects sampled via getValue().
    room_sz = self.room_sz.getValue()
    T60 = self.T60.getValue()
    abs_weights = self.abs_weights.getValue()
    beta = gpuRIR.beta_SabineEstimation(room_sz, T60, abs_weights)

    # Microphones: array centre as a fraction of the room, plus the
    # array geometry's relative mic positions.
    array_pos = self.array_pos.getValue() * room_sz
    mic_pos = array_pos + self.array_setup.mic_pos

    # Trajectory points: restrict sources to the half-space the planar
    # array faces (along its orientation vector orV).
    src_pos_min = np.array([0.0, 0.0, 0.0])
    src_pos_max = room_sz
    # NOTE(review): src_pos_max aliases room_sz — the in-place assignment
    # in the else-branch below would mutate room_sz too if getValue()
    # returns a shared array; confirm getValue() returns a fresh copy.
    if self.array_setup.arrayType == 'planar':
        if np.sum(self.array_setup.orV) > 0:
            src_pos_min[np.nonzero(
                self.array_setup.orV)] = array_pos[np.nonzero(
                    self.array_setup.orV)]
        else:
            src_pos_max[np.nonzero(
                self.array_setup.orV)] = array_pos[np.nonzero(
                    self.array_setup.orV)]
    # Random start/end points of the trajectory inside the allowed box.
    src_pos_ini = src_pos_min + np.random.random(3) * (src_pos_max -
                                                       src_pos_min)
    src_pos_end = src_pos_min + np.random.random(3) * (src_pos_max -
                                                       src_pos_min)

    # Largest oscillation amplitude that keeps the whole path inside the box.
    Amax = np.min(np.stack(
        (src_pos_ini - src_pos_min, src_pos_max - src_pos_ini,
         src_pos_end - src_pos_min, src_pos_max - src_pos_end)),
                  axis=0)
    A = np.random.random(3) * np.minimum(
        Amax, 1)  # Oscillations with 1 m as maximum in each axis
    w = 2 * np.pi / self.nb_points * np.random.random(
        3) * 2  # Between 0 and 2 oscillations in each axis

    # Straight line from ini to end, one row per trajectory point...
    traj_pts = np.array([
        np.linspace(i, j, self.nb_points)
        for i, j in zip(src_pos_ini, src_pos_end)
    ]).transpose()
    # ...with a sinusoidal perturbation superimposed per axis.
    traj_pts += A * np.sin(w * np.arange(self.nb_points)[:, np.newaxis])

    # With probability 0.25 use a static source instead of a moving one.
    if np.random.random(1) < 0.25:
        traj_pts = np.ones((self.nb_points, 1)) * src_pos_ini

    # Interpolate trajectory points to one position per signal sample.
    timestamps = np.arange(
        self.nb_points) * len(source_signal) / self.fs / self.nb_points
    t = np.arange(len(source_signal)) / self.fs
    trajectory = np.array([
        np.interp(t, timestamps, traj_pts[:, i]) for i in range(3)
    ]).transpose()

    # DOA: azimuth/elevation of the source relative to the array centre
    # (cart2sph columns 1:3 — drops the radius).
    acoustic_scene = AcousticScene(room_sz=room_sz,
                                   T60=T60,
                                   beta=beta,
                                   SNR=self.SNR.getValue(),
                                   array_setup=self.array_setup,
                                   mic_pos=mic_pos,
                                   source_signal=source_signal,
                                   fs=self.fs,
                                   t=t,
                                   traj_pts=traj_pts,
                                   timestamps=timestamps,
                                   trajectory=trajectory,
                                   DOA=cart2sph(trajectory -
                                                array_pos)[:, 1:3])
    acoustic_scene.source_vad = vad
    return acoustic_scene
pos_src = np.array([[1, 2.9, 0.5], [1, 2, 0.5]]) # Positions of the sources ([m] nb_rcv = 3 # Number of receivers pos_rcv = np.array([[0.5, 1, 0.5], [1, 1, 0.5], [1.5, 1, 0.5]]) # Position of the receivers [m] orV_rcv = np.matlib.repmat( np.array([0, 1, 0]), nb_rcv, 1) # Vectors pointing in the same direction than the receivers mic_pattern = "card" # Receiver polar pattern abs_weights = [0.9] * 5 + [0.5] # Absortion coefficient ratios of the walls T60 = 1.0 # Time for the RIR to reach 60dB of attenuation [s] att_diff = 15.0 # Attenuation when start using the diffuse reverberation model [dB] att_max = 60.0 # Attenuation at the end of the simulation [dB] fs = 16000.0 # Sampling frequency [Hz] beta = gpuRIR.beta_SabineEstimation( room_sz, T60, abs_weights=abs_weights) # Reflection coefficients Tdiff = gpuRIR.att2t_SabineEstimator( att_diff, T60) # Time to start the diffuse reverberation model [s] Tmax = gpuRIR.att2t_SabineEstimator(att_max, T60) # Time to stop the simulation [s] nb_img = gpuRIR.t2n(Tdiff, room_sz) # Number of image sources in each dimension RIRs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs, Tdiff=Tdiff, orV_rcv=orV_rcv,
def generate_data(output_path='',
                  dataset='adhoc',
                  libri_path='/hdd/data/Librispeech/LibriSpeech',
                  noise_path='/hdd/data/Nonspeech'):
    """
    Generate multi-channel 2-speaker + noise mixtures from pickled scene
    configurations and save per-mic wav files.

    For every config entry: load two LibriSpeech utterances and a noise
    file, rescale them to the configured SNRs, simulate RIRs with gpuRIR,
    convolve per microphone, overlap-align the two speakers, add
    reverberant noise and write spk1/spk2/mixture wavs.

    Args:
        output_path: root directory for the generated data (cwd if '').
        dataset: 'adhoc' (per-config mic count) or 'fixed' (6 mics).
        libri_path: root of the LibriSpeech corpus.
        noise_path: root of the noise corpus.
    """
    assert dataset in ['adhoc', 'fixed'], "dataset can only be adhoc or fixed."
    if output_path == '':
        output_path = os.getcwd()

    data_type = ['train', 'validation', 'test']
    for i in range(len(data_type)):
        # path for config
        config_path = os.path.join(
            'configs', 'MC_Libri_' + dataset + '_' + data_type[i] + '.pkl')
        # load pickle file
        with open(config_path, 'rb') as f:
            configs = pickle.load(f)

        # sample rate is 16k Hz
        sr = 16000
        # signal length is 4 sec
        sig_len = 4

        # generate and save audio
        save_dir = os.path.join(output_path, 'MC_Libri_' + dataset,
                                data_type[i])
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for utt in range(len(configs)):
            this_config = configs[utt]
            # load audio files
            speakers = this_config['speech']
            noise = this_config['noise']
            spk1, _ = sf.read(os.path.join(libri_path, speakers[0]))
            spk2, _ = sf.read(os.path.join(libri_path, speakers[1]))
            noise, _ = sf.read(os.path.join(noise_path, noise))

            # calculate signal length according to overlap ratio
            overlap_ratio = this_config['overlap_ratio']
            actual_len = int(sig_len / (2 - overlap_ratio)) * sr
            overlap = int(actual_len * overlap_ratio)

            # truncate speech according to start and end indexes
            start_idx = this_config['start_idx']
            end_idx = this_config['end_idx']
            spk1 = spk1[start_idx:end_idx]
            spk2 = spk2[start_idx:end_idx]

            # rescaling spk2 energy according to relative SNR
            spk1 = spk1 / np.sqrt(np.sum(spk1**2) + 1e-8) * 1e2
            spk2 = spk2 / np.sqrt(np.sum(spk2**2) + 1e-8) * 1e2
            spk2 = spk2 * np.power(10, this_config['spk_snr'] / 20.)

            # load locations and room configs
            mic_pos = np.asarray(this_config['mic_pos'])
            spk_pos = np.asarray(this_config['spk_pos'])
            noise_pos = np.asarray(this_config['noise_pos'])
            room_size = np.asarray(this_config['room_size'])
            rt60 = this_config['RT60']

            # generate RIR
            beta = gpuRIR.beta_SabineEstimation(room_size, rt60)
            nb_img = gpuRIR.t2n(rt60, room_size)
            spk_rir = gpuRIR.simulateRIR(room_size, beta, spk_pos, mic_pos,
                                         nb_img, rt60, sr)
            noise_rir = gpuRIR.simulateRIR(room_size, beta, noise_pos,
                                           mic_pos, nb_img, rt60, sr)

            # convolve with RIR at different mic
            if dataset == 'adhoc':
                nmic = this_config['num_mic']
            else:
                nmic = 6
            for mic in range(nmic):
                spk1_echoic_sig = signal.fftconvolve(spk1, spk_rir[0][mic])
                spk2_echoic_sig = signal.fftconvolve(spk2, spk_rir[1][mic])

                # align the speakers according to overlap ratio
                actual_length = len(spk1_echoic_sig)
                total_length = actual_length * 2 - overlap
                padding = np.zeros(actual_length - overlap)
                spk1_echoic_sig = np.concatenate([spk1_echoic_sig, padding])
                spk2_echoic_sig = np.concatenate([padding, spk2_echoic_sig])
                mixture = spk1_echoic_sig + spk2_echoic_sig

                # add noise
                # BUGFIX: work on a per-mic copy of the raw noise. The
                # original code reassigned `noise` itself (truncate,
                # convolve, rescale), so every microphone after the first
                # convolved noise that had already been convolved and
                # rescaled for the previous mic.
                noise_seg = noise[:total_length]
                if len(noise_seg) < total_length:
                    # repeat noise if necessary
                    num_repeat = total_length // len(noise_seg)
                    res = total_length - num_repeat * len(noise_seg)
                    noise_seg = np.concatenate(
                        [np.concatenate([noise_seg] * num_repeat),
                         noise_seg[:res]])
                noise_echoic = signal.fftconvolve(noise_seg,
                                                  noise_rir[0][mic])
                # rescaling noise energy w.r.t. mixture energy and SNR
                noise_echoic = noise_echoic[:total_length]
                noise_echoic = noise_echoic / np.sqrt(
                    np.sum(noise_echoic**2) + 1e-8) * np.sqrt(
                        np.sum(mixture**2) + 1e-8)
                noise_echoic = noise_echoic / np.power(
                    10, this_config['noise_snr'] / 20.)
                mixture += noise_echoic

                # save waveforms
                this_save_dir = os.path.join(save_dir,
                                             str(nmic) + 'mic',
                                             'sample' + str(utt + 1))
                if not os.path.exists(this_save_dir):
                    os.makedirs(this_save_dir)
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk1_mic' + str(mic + 1) + '.wav'),
                    spk1_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk2_mic' + str(mic + 1) + '.wav'),
                    spk2_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'mixture_mic' + str(mic + 1) + '.wav'),
                    mixture, sr)

            # print progress
            # ROBUSTNESS: max(1, ...) avoids ZeroDivisionError when fewer
            # than 5 utterances are configured.
            if (utt + 1) % max(1, len(configs) // 5) == 0:
                print(
                    "{} configuration, {} set, {:d} out of {:d} utterances generated."
                    .format(dataset, data_type[i], utt + 1, len(configs)))
def generate_data(output_path='',
                  avoid_clipping=0,
                  dataset='adhoc',
                  libri_path='/home/yi/data/Librispeech',
                  noise_path='/home/yi/data/Nonspeech'):
    """
    Generate multi-channel 2-speaker + noise mixtures from pickled scene
    configurations and save per-mic wav files.

    For every config entry: load two LibriSpeech utterances and a noise
    file, rescale to the configured SNRs, simulate RIRs with gpuRIR,
    convolve per microphone, overlap-align the two speakers, pad/truncate
    everything to 4 s and write spk1/spk2/mixture wavs.

    Args:
        output_path: root directory for the generated data (cwd if '').
        avoid_clipping: when truthy, rescale mixture and speakers so the
            loudest of them peaks at 0.9.
        dataset: 'adhoc' (per-config mic count) or 'fixed' (6 mics).
        libri_path: root of the LibriSpeech corpus.
        noise_path: root of the noise corpus.
    """
    assert dataset in ['adhoc', 'fixed'], "dataset can only be adhoc or fixed."
    if output_path == '':
        output_path = os.getcwd()

    data_type = ['train', 'validation', 'test']
    for i in range(len(data_type)):
        # path for config
        config_path = os.path.join(
            'configs', 'MC_Libri_' + dataset + '_' + data_type[i] + '.pkl')
        # load pickle file
        with open(config_path, 'rb') as f:
            configs = pickle.load(f)

        # sample rate is 16k Hz
        sr = 16000
        # signal length is 4 sec
        sig_len = 4

        for utt in range(len(configs)):
            this_config = configs[utt]
            # load audio files
            speakers = this_config['speech']
            noise = this_config['noise']
            spk1, _ = sf.read(os.path.join(libri_path, speakers[0]))
            spk2, _ = sf.read(os.path.join(libri_path, speakers[1]))
            noise, _ = sf.read(os.path.join(noise_path, noise))

            # calculate signal length according to overlap ratio
            overlap_ratio = this_config['overlap_ratio']
            actual_len = int(sig_len / (2 - overlap_ratio) * sr)
            overlap = int(actual_len * overlap_ratio)

            # truncate speech according to start and end indexes
            start_idx = this_config['start_idx']
            end_idx = start_idx + actual_len
            spk1 = spk1[start_idx:end_idx]
            spk2 = spk2[start_idx:end_idx]

            # rescaling speaker and noise energy according to relative SNR
            spk1 = spk1 / np.sqrt(np.sum(spk1**2) + 1e-8) * 1e2
            spk2 = spk2 / np.sqrt(np.sum(spk2**2) + 1e-8) * 1e2
            spk2 = spk2 * np.power(10, this_config['spk_snr'] / 20.)

            # repeat noise if necessary
            noise = noise[:int(sig_len * sr)]
            if len(noise) < int(sig_len * sr):
                num_repeat = int(sig_len * sr) // len(noise)
                res = int(sig_len * sr) - num_repeat * len(noise)
                noise = np.concatenate(
                    [np.concatenate([noise] * num_repeat), noise[:res]])

            # rescale noise energy w.r.t mixture energy
            noise = noise / np.sqrt(np.sum(noise**2) + 1e-8) * np.sqrt(
                np.sum((spk1 + spk2)**2) + 1e-8)
            noise = noise / np.power(10, this_config['noise_snr'] / 20.)

            # load locations and room configs
            mic_pos = np.asarray(this_config['mic_pos'])
            spk_pos = np.asarray(this_config['spk_pos'])
            noise_pos = np.asarray(this_config['noise_pos'])
            room_size = np.asarray(this_config['room_size'])
            rt60 = this_config['RT60']
            num_mic = len(mic_pos)

            # generate RIR
            beta = gpuRIR.beta_SabineEstimation(room_size, rt60)
            nb_img = gpuRIR.t2n(rt60, room_size)
            spk_rir = gpuRIR.simulateRIR(room_size, beta, spk_pos, mic_pos,
                                         nb_img, rt60, sr)
            noise_rir = gpuRIR.simulateRIR(room_size, beta, noise_pos,
                                           mic_pos, nb_img, rt60, sr)

            # convolve with RIR at different mic
            # NOTE(review): these three accumulators are never appended to
            # or read below — they look like leftovers from an earlier
            # version of this function.
            echoic_spk1 = []
            echoic_spk2 = []
            echoic_mixture = []
            if dataset == 'adhoc':
                nmic = this_config['num_mic']
            else:
                nmic = 6
            for mic in range(nmic):
                # Note: noise itself is never reassigned inside this loop,
                # so each mic convolves the same clean noise signal.
                spk1_echoic_sig = signal.fftconvolve(spk1, spk_rir[0][mic])
                spk2_echoic_sig = signal.fftconvolve(spk2, spk_rir[1][mic])
                noise_echoic_sig = signal.fftconvolve(noise,
                                                      noise_rir[0][mic])

                # align the speakers according to overlap ratio
                pad_length = int((1 - overlap_ratio) * actual_len)
                padding = np.zeros(pad_length)
                spk1_echoic_sig = np.concatenate([spk1_echoic_sig, padding])
                spk2_echoic_sig = np.concatenate([padding, spk2_echoic_sig])

                # pad or truncate length to 4s if necessary
                def pad_sig(x):
                    # Force x to exactly sig_len*sr samples (zero-pad or cut).
                    if len(x) < sig_len * sr:
                        zeros = np.zeros(sig_len * sr - len(x))
                        return np.concatenate([x, zeros])
                    else:
                        return x[:sig_len * sr]

                spk1_echoic_sig = pad_sig(spk1_echoic_sig)
                spk2_echoic_sig = pad_sig(spk2_echoic_sig)
                noise_echoic_sig = pad_sig(noise_echoic_sig)

                # sum up for mixture
                mixture = spk1_echoic_sig + spk2_echoic_sig + noise_echoic_sig

                if avoid_clipping:
                    # avoid clipping
                    # NOTE(review): the noise channel is deliberately (?)
                    # excluded from max_scale and not rescaled — only the
                    # three saved signals are; confirm this is intended.
                    max_scale = np.max([
                        np.max(np.abs(mixture)),
                        np.max(np.abs(spk1_echoic_sig)),
                        np.max(np.abs(spk2_echoic_sig))
                    ])
                    mixture = mixture / max_scale * 0.9
                    spk1_echoic_sig = spk1_echoic_sig / max_scale * 0.9
                    spk2_echoic_sig = spk2_echoic_sig / max_scale * 0.9

                # save waveforms
                this_save_dir = os.path.join(output_path,
                                             'MC_Libri_' + dataset,
                                             data_type[i],
                                             str(num_mic) + 'mic',
                                             'sample' + str(utt + 1))
                if not os.path.exists(this_save_dir):
                    os.makedirs(this_save_dir)
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk1_mic' + str(mic + 1) + '.wav'),
                    spk1_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'spk2_mic' + str(mic + 1) + '.wav'),
                    spk2_echoic_sig, sr)
                sf.write(
                    os.path.join(this_save_dir,
                                 'mixture_mic' + str(mic + 1) + '.wav'),
                    mixture, sr)

            # print progress
            # NOTE(review): len(configs) // 5 raises ZeroDivisionError when
            # fewer than 5 utterances are configured — consider
            # max(1, len(configs) // 5).
            if (utt + 1) % (len(configs) // 5) == 0:
                print(
                    "{} configuration, {} set, {:d} out of {:d} utterances generated."
                    .format(dataset, data_type[i], utt + 1, len(configs)))