def _read_polar_positions(path):
    """Parse a whitespace-separated text file into polar ``Position`` objects.

    The first three numbers of every non-blank line are handed to
    ``Position`` in file order together with the ``'polar'`` coordinate type.
    """
    # A context manager guarantees the file handle is closed; blank lines
    # (e.g. a trailing newline) are skipped instead of raising IndexError.
    with open(path, 'r') as handle:
        rows = [[float(tok) for tok in line.split()]
                for line in handle if line.strip()]
    return [Position(row[0], row[1], row[2], 'polar') for row in rows]


def _play_and_discard(stereo, rate):
    """Save ``stereo`` to a temporary wav, play it with ``play``, then delete it."""
    from pyutils.iolib.audio import save_wav

    wav_path = '/tmp/output.wav'
    save_wav(wav_path, stereo, rate)
    try:
        os.system('play ' + wav_path)
    finally:
        # Always clean up the temporary file, even if playback raises.
        os.remove(wav_path)


def test_virtual_mic():
    """Manual listening test for ``VirtualStereoMic``.

    Renders the first channel of ``wav_test/piano.wav`` twice and plays each
    result through the external ``play`` command:

    1. as a ``PositionalSource`` fixed at the first position listed in
       ``wav_test/piano_stat_position.txt`` (rendered with ``binauralize``);
    2. as a ``MovingSource`` following ``wav_test/piano_mov_position.txt``
       (rendered frame by frame with ``binauralize_frame``).
    """
    from pyutils.iolib.audio import load_wav

    mic = VirtualStereoMic()
    mono, rate = load_wav('wav_test/piano.wav')
    mono = mono[:, 0]

    # Static source: render the whole signal in one call.
    positions = _read_polar_positions('wav_test/piano_stat_position.txt')
    source = PositionalSource(mono, positions[0], rate)
    stereo = mic.binauralize([source])
    _play_and_discard(stereo, rate)

    # Moving source: accumulate the output one frame at a time while the
    # source keeps advancing.
    positions = _read_polar_positions('wav_test/piano_mov_position.txt')
    source = MovingSource(mono, positions, rate)
    stereo = np.zeros((mono.shape[0], 2))
    while source.tic():
        mic.binauralize_frame([source], stereo, source.cur_idx)
    _play_and_discard(stereo, rate)
def __init__(self, ambi_format, method='projection'):
    """Set up a decoder that renders ambisonics directly at two ear positions.

    Args:
        ambi_format: ambisonics format descriptor, stored as ``self.fmt``
            and forwarded to ``AmbiDecoder``.
        method: decoding method name forwarded to ``AmbiDecoder``.
    """
    self.fmt = ambi_format
    self.method = method

    # The two ears sit at +0.1 and -0.1 on the second cartesian coordinate.
    ear_offset = 0.1
    first_ear = Position(0, ear_offset, 0, 'cartesian')
    second_ear = Position(0, -ear_offset, 0, 'cartesian')
    self.ear_pos = [first_ear, second_ear]

    self.ambi_decoder = AmbiDecoder(self.ear_pos, self.fmt, method=self.method)
def __init__(self, ambi_format, mic_pos, method='projection', td_order=5, **kwargs):
    """Build the virtual speaker layout and the matching ambisonic decoder.

    Args:
        ambi_format: ambisonics format descriptor; ``num_channels`` and
            ``radius`` are read from it for the ``'projection'`` layout.
        mic_pos: microphone positions forwarded to ``VirtualMics``.
        method: ``'pseudoinv'`` (speakers on the t-design returned by
            ``get_tDesign(td_order)``) or ``'projection'`` (a ring of
            ``2 * num_channels`` evenly spaced speakers).
        td_order: order passed to ``get_tDesign`` for ``'pseudoinv'``.
        **kwargs: extra keyword arguments forwarded to ``VirtualMics``.

    Raises:
        ValueError: if ``method`` is neither ``'projection'`` nor ``'pseudoinv'``.
    """
    self.source_decoder = VirtualMics(mic_pos, **kwargs)
    self.fmt = ambi_format
    self.method = method

    # Initialize speakers
    if self.method == 'pseudoinv':
        self.speaker_pos = [
            Position(pt[0], pt[1], pt[2], 'cartesian')
            for pt in get_tDesign(td_order)
        ]
    elif self.method == 'projection':
        ring_size = 2 * self.fmt.num_channels
        # Evenly spaced first polar coordinate covering [-pi, pi).
        speakers_phi = (2. * np.arange(ring_size) / float(2 * self.fmt.num_channels) - 1.) * np.pi
        self.speaker_pos = [
            Position(phi, 0, self.fmt.radius, 'polar') for phi in speakers_phi
        ]
    else:
        raise ValueError('Unknown decoding method. Options: projection and pseudoinv')

    self.n_speakers = len(self.speaker_pos)
    self.ambi_decoder = AmbiDecoder(self.speaker_pos, self.fmt, method=self.method)
def __call__(self, video, audio, segm, center, fov):
    """Crop video/segmentation around ``center`` and steer the audio there.

    Args:
        video: iterable of frames; each is converted with ``np.asarray``
            and passed through ``self.nfov.toNFOV``.
        audio: ambisonic signal to decode or rotate.
        segm: optional segmentation map; cropped with nearest-neighbour
            interpolation when not ``None``.
        center: crop centre, unpacked into ``er2polar``.
        fov: field of view passed to ``self.nfov.setFOV``.

    Returns:
        Tuple ``(video, audio, segm)`` where ``video`` is a list of
        ``Image`` objects and ``audio`` has been cast to ``float32``
        before any binauralization.
    """
    from utils.ambisonics.decoder import decode_ambix
    from utils.ambisonics.position import Position
    from utils.video360 import rotate_ambix
    from utils.video360 import er2polar
    from utils.ambisonics.binauralizer import DirectAmbisonicBinauralizer, AmbisonicDecoder
    from utils.ambisonics.common import AmbiFormat

    # Crop frames
    self.nfov.setFOV(fov)
    self.nfov.setCenterPoit(center)
    cropped = []
    for img in video:
        cropped.append(Image.fromarray(self.nfov.toNFOV(np.asarray(img))))
    video = cropped

    # Crop segmentation maps
    if segm is not None:
        segm = self.nfov.toNFOV(segm, interpolation='nn')

    # Decode ambisonics into center point direction
    pos_ambix = Position(*er2polar(*center), 'polar')
    if self.audio_input == 'mono':
        audio = decode_ambix(audio, pos_ambix).astype(np.float32)
        return video, audio, segm

    if self.audio_input == 'stereo':
        rotated = rotate_ambix(audio, pos_ambix)[0].astype(np.float32)
        binauralizer = DirectAmbisonicBinauralizer(AmbiFormat(), method='projection')
        audio = binauralizer.binauralize(rotated)
        return video, audio, segm

    # Any other input mode keeps the rotated ambisonic signal as is.
    audio = rotate_ambix(audio, pos_ambix)[0].astype(np.float32)
    return video, audio, segm
def __init__(self, ambi_format, method='projection', use_hrtfs=False, cipic_dir=None):
    """Build the virtual speaker layout, its decoder and the source binauralizer.

    Args:
        ambi_format: ambisonics format descriptor; ``order``,
            ``num_channels`` and ``radius`` are read from it.
        method: ``'pseudoinv'`` (speakers on the t-design returned by
            ``get_tDesign(self.fmt.order)``, moved to ``self.fmt.radius``)
            or ``'projection'`` (a ring of ``2 * num_channels`` evenly
            spaced speakers at ``self.fmt.radius``).
        use_hrtfs: forwarded to ``SourceBinauralizer``.
        cipic_dir: forwarded to ``SourceBinauralizer``.

    Raises:
        ValueError: if ``method`` is neither ``'projection'`` nor ``'pseudoinv'``.
    """
    self.source_bin = SourceBinauralizer(cipic_dir=cipic_dir, use_hrtfs=use_hrtfs)
    self.fmt = ambi_format
    self.method = method

    # Initialize speakers
    if self.method == 'pseudoinv':
        # Materialize the positions as a list: on Python 3 ``map`` is lazy,
        # so a bare ``map`` object has no ``len()`` and a ``map`` used only
        # for its side effect (``set_radius``) would never execute.
        self.speaker_pos = [
            Position(pt[0], pt[1], pt[2], 'cartesian')
            for pt in get_tDesign(self.fmt.order)
        ]
        for speaker in self.speaker_pos:
            speaker.set_radius(self.fmt.radius)
    elif self.method == 'projection':
        ring_size = 2 * self.fmt.num_channels
        # Evenly spaced first polar coordinate covering [-pi, pi).
        speakers_phi = (2. * np.arange(ring_size) / float(ring_size) - 1.) * np.pi
        self.speaker_pos = [
            Position(phi, 0, self.fmt.radius, 'polar') for phi in speakers_phi
        ]
    else:
        raise ValueError('Unknown decoding method. Options: projection and pseudoinv')

    self.n_speakers = len(self.speaker_pos)
    self.ambi_decoder = AmbiDecoder(self.speaker_pos, self.fmt, method=self.method)
class VirtualStereoMic(object):
    """Pair of virtual microphones placed at (0, +radius, 0) and (0, -radius, 0).

    Each source is rendered per microphone with a distance-dependent integer
    sample delay and a frequency-independent ``1 / (1 + distance)`` gain.
    """

    def __init__(self, radius=0.1):
        self.radius = radius
        self.lmic_pos = Position(0, radius, 0, 'cartesian')
        self.rmic_pos = Position(0, -radius, 0, 'cartesian')

    @staticmethod
    def _delay_and_gain(src, mic_pos):
        """Return ``(delay_in_samples, attenuation)`` of ``src`` as seen from ``mic_pos``."""
        offset = src.position.coords('cartesian') - mic_pos.coords('cartesian')
        dist = np.sqrt((offset ** 2).sum())
        # Time delay (``C`` is a module-level constant; presumably the
        # propagation speed -- confirm against its definition).
        delay = int(dist / C * src.sample_rate)
        # Attenuation is frequency dependent, but lets simplify.
        attn = 1 / (1. + dist)
        return delay, attn

    def binauralize(self, sources):
        """Render complete source signals into a two-column (left, right) array.

        ``sources`` may be a single ``PositionalSource`` or an iterable of
        them; every contribution is divided by the number of sources.
        """
        if isinstance(sources, PositionalSource):
            sources = [sources]

        l_signal, r_signal = 0, 0.
        for src in sources:
            l_delay, l_attn = self._delay_and_gain(src, self.lmic_pos)
            r_delay, r_attn = self._delay_and_gain(src, self.rmic_pos)
            l_signal += l_attn * shift(src.signal, l_delay, cval=0.) / len(sources)
            r_signal += r_attn * shift(src.signal, r_delay, cval=0.) / len(sources)

        return np.stack((l_signal, r_signal), axis=1)

    def binauralize_frame(self, sources, output, frame_no):
        """Accumulate the contribution of ``sources`` to row ``frame_no`` of ``output``.

        ``output`` is modified in place (column 0 is left, column 1 is
        right). A channel is left untouched while its delayed sample index
        would still be negative.
        """
        if isinstance(sources, PositionalSource):
            sources = [sources]

        for src in sources:
            l_delay, l_attn = self._delay_and_gain(src, self.lmic_pos)
            r_delay, r_attn = self._delay_and_gain(src, self.rmic_pos)

            l_idx = frame_no - l_delay
            if l_idx >= 0:
                output[frame_no, 0] += l_attn * src.signal[l_idx] / len(sources)

            r_idx = frame_no - r_delay
            if r_idx >= 0:
                output[frame_no, 1] += r_attn * src.signal[r_idx] / len(sources)
def audio_crop_freq_sep(ambix, center, sigma=9, thr=0.7):
    """Keep only the time-frequency bins where the ``center`` direction dominates.

    The audio projected towards ``center`` is compared, bin by bin, with
    the audio projected towards a 16 x 8 grid of other directions. Grid
    directions whose dot product with ``center`` exceeds ``cos(pi / 2)``
    are skipped. Both sides are compared through Gaussian-smoothed STFT
    magnitudes. A bin is kept when the fraction of compared directions it
    exceeds is at least ``thr``.

    Args:
        ambix: ambisonic signal passed to ``project_audio``.
        center: ``Position`` of the direction to keep.
        sigma: ``sigma`` of the Gaussian smoothing applied to magnitudes.
        thr: minimum fraction of compared directions a bin must dominate.

    Returns:
        Waveform obtained by inverting the masked STFT of the centre signal.
    """
    import librosa
    from scipy import ndimage

    center_stft = librosa.core.stft(project_audio(ambix, center=center)[0])
    center_stft_smooth = ndimage.gaussian_filter(np.abs(center_stft), sigma=sigma)

    # Loop invariants, computed once instead of on every grid point.
    center_xyz = center.coords('cartesian')
    cos_limit = math.cos(pi / 2)
    nu_grid = np.linspace(-pi / 2, pi / 2, 8)

    other_stft_smooth = []
    for phi in np.linspace(-pi, pi, 16):
        for nu in nu_grid:
            p = Position(phi, nu, 1., 'polar')
            if (p.coords('cartesian') * center_xyz).sum() > cos_limit:
                continue
            stft = librosa.core.stft(project_audio(ambix, center=p)[0])
            other_stft_smooth.append(
                ndimage.gaussian_filter(np.abs(stft), sigma=sigma))
    other_stft_smooth = np.stack(other_stft_smooth, 0)

    # Fraction of compared directions whose smoothed magnitude is below
    # the centre's, per time-frequency bin.
    rank = (np.abs(center_stft_smooth[np.newaxis]) > np.abs(other_stft_smooth)).sum(0) / other_stft_smooth.shape[0]

    # The centre STFT is no longer needed unmasked, so it is masked in
    # place rather than recomputing the identical projection and transform.
    center_stft[rank < thr] = 0
    wav = librosa.core.istft(center_stft)
    return wav
def __init__(self, ambi_order=1, window=None, angular_res=20.0):
    """Prepare a projection decoder over a spherical mesh of directions.

    Args:
        ambi_order: ambisonic order passed to ``AmbiFormat``.
        window: stored unchanged as ``self.window``.
        angular_res: resolution passed to ``spherical_mesh``.
    """
    self.angular_res = angular_res
    self.window = window

    self.phi_mesh, self.nu_mesh = spherical_mesh(angular_res)
    self.frame_shape = self.phi_mesh.shape

    # One unit-radius polar position per mesh node, in flattened mesh order.
    flat_phi = self.phi_mesh.reshape(-1)
    flat_nu = self.nu_mesh.reshape(-1)
    mesh_p = []
    for phi, nu in zip(flat_phi, flat_nu):
        mesh_p.append(Position(phi, nu, 1., 'polar'))

    # Setup decoder
    fmt = AmbiFormat(ambi_order)
    self.decoder = AmbiDecoder(mesh_p, fmt, method='projection')
def ambix_power_map(ambix, audio_rate=22050, outp_rate=10, angular_res=5.0):
    """Compute per-window RMS maps of an ambisonic signal over a spherical mesh.

    Args:
        ambix: array indexed as ``(channels, samples)``; the ambisonic
            order is derived as ``sqrt(channels) - 1``.
        audio_rate: sample rate of ``ambix``.
        outp_rate: number of maps per second; each window spans
            ``int(audio_rate / outp_rate)`` samples.
        angular_res: resolution passed to ``spherical_mesh``.

    Returns:
        Array stacking one vertically flipped RMS map (shaped like the
        mesh) per window along axis 0.
    """
    from utils.ambisonics.distance import spherical_mesh
    from utils.ambisonics.decoder import AmbiDecoder, AmbiFormat
    from utils.ambisonics.position import Position

    phi_mesh, nu_mesh = spherical_mesh(angular_res)
    mesh_p = []
    for phi, nu in zip(phi_mesh.reshape(-1), nu_mesh.reshape(-1)):
        mesh_p.append(Position(phi, nu, 1., 'polar'))

    order = int(math.sqrt(ambix.shape[0]) - 1)
    fmt = AmbiFormat(ambi_order=order, sample_rate=audio_rate)
    decoder = AmbiDecoder(mesh_p, fmt, method='projection')

    window_size = int(audio_rate / outp_rate)

    def _rms_map(start):
        # Compute RMS at each speaker for the window beginning at ``start``.
        begin = int(start)
        decoded = decoder.decode(ambix[:, begin:begin + window_size])
        per_speaker = np.sqrt(np.mean(decoded ** 2, 1))
        return np.flipud(per_speaker.reshape(phi_mesh.shape))

    rms = [_rms_map(t) for t in np.arange(0, ambix.shape[1], window_size)]
    return np.stack(rms, 0)
def __call__(self, frames, sig):
    """Crop the frames around a random centre and decode audio towards it.

    The centre is drawn uniformly from ``[0, 1]`` for the first coordinate
    and ``[0.33, 0.66]`` for the second.

    Args:
        frames: iterable of images, each converted with ``np.asarray``.
        sig: ambisonic signal passed to ``decode_ambix``.

    Returns:
        Tuple ``(video, audio)``: the list of cropped ``Image`` objects and
        the decoded audio cast to ``float32``.
    """
    from utils.ambisonics.decoder import decode_ambix
    from utils.ambisonics.position import Position
    from utils.video360 import er2polar

    # Crop frames
    first_coord = random.uniform(0., 1.)
    second_coord = random.uniform(0.33, 0.66)
    center_point = np.array([first_coord, second_coord])
    self.nfov.setCenterPoit(center_point)

    video = []
    for img in frames:
        video.append(Image.fromarray(self.nfov.toNFOV(np.asarray(img))))

    # Decode ambisonics into center point direction
    direction = Position(*er2polar(*center_point), 'polar')
    audio = decode_ambix(sig, direction).astype(np.float32)

    return video, audio
def __init__(self, data, rate=22050, window=0.1, angular_res=2.0):
    """Prepare a projection decoder and framing parameters for ``data``.

    Args:
        data: array indexed as ``(channels, samples)``; the ambisonic
            order is derived as ``sqrt(channels) - 1``.
        rate: sample rate of ``data``.
        window: length in seconds of each analysis chunk.
        angular_res: resolution passed to ``spherical_mesh``.
    """
    self.window = window
    self.angular_res = angular_res
    self.data = data

    self.phi_mesh, self.nu_mesh = spherical_mesh(angular_res)
    # One unit-radius polar position per mesh node, in flattened mesh order.
    mesh_p = [
        Position(phi, nu, 1., 'polar')
        for phi, nu in zip(self.phi_mesh.reshape(-1), self.nu_mesh.reshape(-1))
    ]

    # Setup decoder. The order is cast to a plain int so that AmbiFormat
    # receives an integer order rather than a numpy float.
    ambi_order = int(np.sqrt(data.shape[0]) - 1)
    self.decoder = AmbiDecoder(
        mesh_p,
        AmbiFormat(ambi_order=ambi_order, sample_rate=rate),
        method='projection')

    # Compute spherical energy averaged over consecutive chunks of "window" secs
    self.window_frames = int(self.window * rate)
    # NOTE(review): true division makes n_frames a float on Python 3;
    # confirm whether consumers expect an integer frame count.
    self.n_frames = data.shape[1] / self.window_frames
    self.output_rate = float(rate) / self.window_frames
    self.frame_dims = self.phi_mesh.shape
    # -1 marks that no frame has been produced yet.
    self.cur_frame = -1
def __init__(self, pos):
    """Create a projection decoder for the given directions.

    Args:
        pos: iterable of ``(x, y)`` pairs; each pair is converted with
            ``er2polar`` and wrapped in a polar ``Position``.
    """
    from utils.ambisonics.decoder import AmbiDecoder
    from utils.ambisonics.position import Position

    directions = []
    for x, y in pos:
        directions.append(Position(*er2polar(x, y), 'polar'))

    self.decoder = AmbiDecoder(directions, method='projection')
# NOTE: the statements up to ``return`` are the tail of a function whose
# opening lines are outside this view; they are kept unchanged.
    # Smoothed STFT magnitudes of the audio projected towards each position.
    pos1_stft_smooth = ndimage.gaussian_filter(np.abs(pos1_stft), sigma=sigma)
    pos2_stft = librosa.core.stft(project_audio(ambix, center=pos2)[0])
    pos2_stft_smooth = ndimage.gaussian_filter(np.abs(pos2_stft), sigma=sigma)
    # Binary mask: 1.0 where pos1's smoothed magnitude exceeds pos2's.
    mask = (pos1_stft_smooth > pos2_stft_smooth).astype(float)
    # Each bin is assigned to exactly one output via complementary masks.
    pos1_wav = librosa.core.istft(pos1_stft * mask)
    pos2_wav = librosa.core.istft(pos2_stft * (1. - mask))
    return pos1_wav, pos2_wav


# Script entry point: rotate one clip's ambisonic audio and its video by the
# same position, then overlay a map built from the rotated audio on the video.
if __name__ == '__main__':
    from utils.ioutils.audio import AudioReader, AudioWriter
    from utils.ambisonics.position import Position
    from math import pi
    # Identifier of the clip under data/spatial-audio-db/ to process.
    yid = 'bb5eETSspVI-213'
    # yid = 'l5M8AvP6rvs-518'
    for rot_theta in [pi/2.]:
        for rot_pitch in [pi/6.]:
            # Output names encode the yaw/pitch values with three decimals.
            audio_fn = f'rot-yaw{rot_theta:.3f}-pitch{rot_pitch:.3f}.m4a'
            video_fn = f'rot-yaw{rot_theta:.3f}-pitch{rot_pitch:.3f}.mp4'
            out_fn = f'out-yaw{rot_theta:.3f}-pitch{rot_pitch:.3f}.mp4'
            pos = Position(rot_theta, rot_pitch, 1., c_type='polar')
            # presumably read(start, duration, rate) -- confirm against AudioReader
            ambix = AudioReader(f'data/spatial-audio-db/audio/{yid}.m4a').read(0., 5., 24000)
            # rotate_ambix returns a sequence; only its first element is used.
            ambix_rot = rotate_ambix(ambix, pos)[0]
            AudioWriter(audio_fn, 24000).write(ambix_rot)
            rotate_video(f'data/spatial-audio-db/video/{yid}.mp4', video_fn, pos)
            overlay_map(audio_fn, video_fn, out_fn)
def __init__(self, radius=0.1):
    """Place the two microphones at (0, +radius, 0) and (0, -radius, 0).

    Args:
        radius: offset of each microphone from the origin along the second
            cartesian coordinate.
    """
    self.radius = radius
    # ``lmic_pos`` takes the positive offset, ``rmic_pos`` the negative one.
    self.lmic_pos, self.rmic_pos = [
        Position(0, offset, 0, 'cartesian') for offset in (radius, -radius)
    ]