Example #1
0
def test_virtual_mic():
    """Render a piano recording through ``VirtualStereoMic`` and play the results.

    Two renderings are produced from the first channel of
    ``wav_test/piano.wav``: one with a ``PositionalSource`` placed at the first
    position listed in ``wav_test/piano_stat_position.txt``, and one with a
    ``MovingSource`` following ``wav_test/piano_mov_position.txt`` frame by
    frame.  Each rendering is written to ``/tmp/output.wav``, played with the
    external ``play`` command and then deleted.  Nothing is asserted; this is a
    listening check.
    """
    from pyutils.iolib.audio import load_wav, save_wav

    def read_positions(path):
        """Parse one polar ``Position`` per non-empty line of whitespace-separated floats."""
        # The context manager closes the file; the previous bare open() leaked the handle.
        with open(path, 'r') as handle:
            rows = [[float(num) for num in line.split()] for line in handle if line.strip()]
        return [Position(row[0], row[1], row[2], 'polar') for row in rows]

    def play_and_remove(stereo, rate):
        """Write ``stereo`` to the scratch file, play it, and always delete the file."""
        save_wav('/tmp/output.wav', stereo, rate)
        try:
            os.system('play /tmp/output.wav')
        finally:
            # Clean up even if playback is interrupted.
            os.remove('/tmp/output.wav')

    mic = VirtualStereoMic()
    mono, rate = load_wav('wav_test/piano.wav')
    mono = mono[:, 0]

    # Static source: only the first listed position is used.
    positions = read_positions('wav_test/piano_stat_position.txt')
    source = PositionalSource(mono, positions[0], rate)
    play_and_remove(mic.binauralize([source]), rate)

    # Moving source: advance it with tic() and render into a preallocated buffer.
    positions = read_positions('wav_test/piano_mov_position.txt')
    source = MovingSource(mono, positions, rate)
    stereo = np.zeros((mono.shape[0], 2))
    while source.tic():
        mic.binauralize_frame([source], stereo, source.cur_idx)
    play_and_remove(stereo, rate)
Example #2
0
    def __init__(self, ambi_format, method='projection'):
        """Store the ambisonic format and build a decoder for two fixed ear positions.

        Args:
            ambi_format: Format object forwarded to ``AmbiDecoder``.
            method: Decoding method name forwarded to ``AmbiDecoder``.
        """
        self.fmt, self.method = ambi_format, method

        # Two cartesian points on the y axis, at +0.1 and -0.1.
        self.ear_pos = [Position(0, y, 0, 'cartesian') for y in (0.1, -0.1)]

        decoder = AmbiDecoder(self.ear_pos, self.fmt, method=self.method)
        self.ambi_decoder = decoder
Example #3
0
    def __init__(self, ambi_format, mic_pos, method='projection', td_order=5, **kwargs):
        """Set up virtual microphones, a speaker layout and the matching ambisonic decoder.

        Args:
            ambi_format: Format object; ``num_channels`` and ``radius`` are read
                for the 'projection' layout and it is forwarded to ``AmbiDecoder``.
            mic_pos: Forwarded to ``VirtualMics``.
            method: 'pseudoinv' (speakers from ``get_tDesign(td_order)``) or
                'projection' (ring of ``2 * num_channels`` speakers).
            td_order: Argument for ``get_tDesign``; only used with 'pseudoinv'.
            **kwargs: Forwarded to ``VirtualMics``.

        Raises:
            ValueError: If ``method`` is neither 'projection' nor 'pseudoinv'.
        """
        self.source_decoder = VirtualMics(mic_pos, **kwargs)
        self.fmt, self.method = ambi_format, method

        # Initialize speakers
        if self.method == 'pseudoinv':
            layout = [Position(pt[0], pt[1], pt[2], 'cartesian') for pt in get_tDesign(td_order)]
        elif self.method == 'projection':
            ring_size = 2 * self.fmt.num_channels
            # Angles evenly spaced over [-pi, pi).
            ring_phi = (2. * np.arange(ring_size) / float(ring_size) - 1.) * np.pi
            layout = [Position(phi, 0, self.fmt.radius, 'polar') for phi in ring_phi]
        else:
            raise ValueError('Unknown decoding method. Options: projection and pseudoinv')

        self.speaker_pos = layout
        self.n_speakers = len(layout)
        self.ambi_decoder = AmbiDecoder(self.speaker_pos, self.fmt, method=self.method)
    def __call__(self, video, audio, segm, center, fov):
        """Crop frames and segmentation with ``self.nfov`` and steer the audio toward ``center``.

        Args:
            video: Iterable of images; each is converted with ``np.asarray``.
            audio: Ambisonic signal handed to ``decode_ambix`` or ``rotate_ambix``.
            segm: Segmentation map, or ``None`` to skip segmentation cropping.
            center: Crop center; unpacked into ``er2polar`` to get the audio direction.
            fov: Field of view passed to ``self.nfov.setFOV``.

        Returns:
            Tuple ``(video, audio, segm)`` with the cropped frames as PIL images,
            the float32 audio and the cropped segmentation (or ``None``).
        """
        from utils.ambisonics.decoder import decode_ambix
        from utils.ambisonics.position import Position
        from utils.video360 import rotate_ambix
        from utils.video360 import er2polar
        from utils.ambisonics.binauralizer import DirectAmbisonicBinauralizer, AmbisonicDecoder
        from utils.ambisonics.common import AmbiFormat

        # Crop frames
        self.nfov.setFOV(fov)
        self.nfov.setCenterPoit(center)
        cropped = []
        for img in video:
            cropped.append(Image.fromarray(self.nfov.toNFOV(np.asarray(img))))
        video = cropped

        # Crop segmentation maps
        if segm is not None:
            segm = self.nfov.toNFOV(segm, interpolation='nn')

        # Decode ambisonics into center point direction
        pos_ambix = Position(*er2polar(*center), 'polar')
        if self.audio_input == 'mono':
            audio = decode_ambix(audio, pos_ambix).astype(np.float32)
            return video, audio, segm

        # Every non-mono mode starts from the rotated sound field.
        audio = rotate_ambix(audio, pos_ambix)[0].astype(np.float32)
        if self.audio_input == 'stereo':
            binauralizer = DirectAmbisonicBinauralizer(
                AmbiFormat(), method='projection')
            audio = binauralizer.binauralize(audio)
        return video, audio, segm
Example #5
0
    def __init__(self, ambi_format, method='projection', use_hrtfs=False, cipic_dir=None):
        """Set up the source binauralizer, a speaker layout and the ambisonic decoder.

        Args:
            ambi_format: Format object; ``order``, ``num_channels`` and ``radius``
                are read from it and it is forwarded to ``AmbiDecoder``.
            method: 'pseudoinv' (speakers from ``get_tDesign(fmt.order)``, moved to
                ``fmt.radius``) or 'projection' (ring of ``2 * num_channels`` speakers).
            use_hrtfs: Forwarded to ``SourceBinauralizer``.
            cipic_dir: Forwarded to ``SourceBinauralizer``.

        Raises:
            ValueError: If ``method`` is neither 'projection' nor 'pseudoinv'.
        """
        self.source_bin = SourceBinauralizer(cipic_dir=cipic_dir, use_hrtfs=use_hrtfs)
        self.fmt = ambi_format
        self.method = method

        # Initialize speakers
        if self.method == 'pseudoinv':
            # Build a real list: on Python 3 a bare map() is a lazy iterator, so
            # len() below would raise TypeError.
            self.speaker_pos = [Position(x[0], x[1], x[2], 'cartesian')
                                for x in get_tDesign(self.fmt.order)]
            # Explicit loop so set_radius actually runs (a discarded map() never does).
            for speaker in self.speaker_pos:
                speaker.set_radius(self.fmt.radius)
        elif self.method == 'projection':
            speakers_phi = (2. * np.arange(2*self.fmt.num_channels) / float(2*self.fmt.num_channels) - 1.) * np.pi
            self.speaker_pos = [Position(phi, 0, self.fmt.radius, 'polar') for phi in speakers_phi]
        else:
            raise ValueError('Unknown decoding method. Options: projection and pseudoinv')
        self.n_speakers = len(self.speaker_pos)
        self.ambi_decoder = AmbiDecoder(self.speaker_pos, self.fmt, method=self.method)
Example #6
0
class VirtualStereoMic(object):
    """Two-microphone capture model that applies only a delay and a distance attenuation.

    The microphones are placed at ``(0, +radius, 0)`` and ``(0, -radius, 0)``
    in cartesian coordinates; the first feeds output channel 0 and the second
    feeds output channel 1.
    """

    def __init__(self, radius=0.1):
        """Place the two microphones ``radius`` away from the origin along the y axis."""
        self.radius = radius
        self.lmic_pos = Position(0, radius, 0, 'cartesian')
        self.rmic_pos = Position(0, -radius, 0, 'cartesian')

    def _propagation(self, src):
        """Return ``[(l_delay, l_attn), (r_delay, r_attn)]`` for ``src`` at its current position.

        Delays are whole samples, computed as ``int(dist / C * src.sample_rate)``
        (``C`` is a module-level constant, presumably the propagation speed --
        confirm).  Attenuation is ``1 / (1 + dist)``.
        """
        src_xyz = src.position.coords('cartesian')
        params = []
        for mic_pos in (self.lmic_pos, self.rmic_pos):
            dist = np.sqrt(((src_xyz - mic_pos.coords('cartesian'))**2).sum())

            # Time delay
            delay = int(dist / C * src.sample_rate)

            # Attenuation is frequency dependent, but lets simplify.
            attn = 1 / (1. + dist)
            params.append((delay, attn))
        return params

    def binauralize(self, sources):
        """Render whole signals of ``sources`` into a two-column array.

        Args:
            sources: A single ``PositionalSource`` or a sized collection of
                sources exposing ``position``, ``signal`` and ``sample_rate``.

        Returns:
            Array built with ``np.stack(..., axis=1)`` holding the averaged,
            delayed and attenuated signals for the two microphones.
        """
        if isinstance(sources, PositionalSource):
            sources = [sources]

        n_sources = len(sources)
        l_signal, r_signal = 0., 0.
        for src in sources:
            (l_delay, l_attn), (r_delay, r_attn) = self._propagation(src)

            # shift() delays the signal, filling the vacated samples with zeros.
            l_signal += l_attn * shift(src.signal, l_delay, cval=0.) / n_sources
            r_signal += r_attn * shift(src.signal, r_delay, cval=0.) / n_sources

        return np.stack((l_signal, r_signal), axis=1)

    def binauralize_frame(self, sources, output, frame_no):
        """Accumulate the contribution of ``sources`` to one output sample, in place.

        Args:
            sources: A single ``PositionalSource`` or a sized collection of sources.
            output: Array indexed as ``output[frame_no, channel]``; modified in place.
            frame_no: Index of the output sample to update.
        """
        if isinstance(sources, PositionalSource):
            sources = [sources]

        n_sources = len(sources)
        for src in sources:
            (l_delay, l_attn), (r_delay, r_attn) = self._propagation(src)

            # Samples that would come from before the start of the signal are skipped.
            if frame_no-l_delay >= 0:
                output[frame_no, 0] += l_attn * src.signal[frame_no-l_delay] / n_sources
            if frame_no-r_delay >= 0:
                output[frame_no, 1] += r_attn * src.signal[frame_no-r_delay] / n_sources
Example #7
0
def audio_crop_freq_sep(ambix, center, sigma=9, thr=0.7):
    """Keep only the time-frequency bins where the ``center`` direction dominates.

    The ambisonic signal is projected toward ``center`` and toward a grid of
    other directions.  The STFT magnitudes are Gaussian-smoothed, and each bin
    of the center STFT is zeroed unless its smoothed magnitude exceeds that of
    at least a fraction ``thr`` of the other directions.

    Args:
        ambix: Ambisonic signal forwarded to ``project_audio``.
        center: ``Position`` of the direction to keep.
        sigma: Standard deviation passed to ``ndimage.gaussian_filter``.
        thr: Minimum fraction of competing directions a bin must beat to be kept.

    Returns:
        Waveform reconstructed with ``librosa.core.istft`` from the masked STFT.
    """
    import librosa
    from scipy import ndimage
    center_stft = librosa.core.stft(project_audio(ambix, center=center)[0])
    center_stft_smooth = ndimage.gaussian_filter(np.abs(center_stft), sigma=sigma)

    center_xyz = center.coords('cartesian')
    other_stft_smooth = []
    for phi in np.linspace(-pi, pi, 16):
        for nu in np.linspace(-pi / 2, pi / 2, 8):
            p = Position(phi, nu, 1., 'polar')
            # Skip directions whose dot product with the center exceeds cos(pi/2),
            # so only the remaining directions compete.
            if (p.coords('cartesian') * center_xyz).sum() > math.cos(pi / 2):
                continue
            stft = librosa.core.stft(project_audio(ambix, center=p)[0])
            other_stft_smooth.append(ndimage.gaussian_filter(np.abs(stft), sigma=sigma))

    other_stft_smooth = np.stack(other_stft_smooth, 0)
    # Smoothed magnitudes are already non-negative, so no extra abs() is needed.
    wins = (center_stft_smooth[np.newaxis] > other_stft_smooth).sum(0)
    rank = wins / float(other_stft_smooth.shape[0])

    # Reuse the center STFT computed above instead of projecting and transforming again.
    center_stft[rank < thr] = 0
    wav = librosa.core.istft(center_stft)
    return wav
    def __init__(self, ambi_order=1, window=None, angular_res=20.0):
        """Build a projection decoder with one output direction per spherical-mesh node.

        Args:
            ambi_order: Order passed to ``AmbiFormat``.
            window: Stored on the instance as ``self.window``.
            angular_res: Resolution passed to ``spherical_mesh``.
        """
        self.angular_res = angular_res
        self.window = window

        self.phi_mesh, self.nu_mesh = spherical_mesh(angular_res)
        self.frame_shape = self.phi_mesh.shape

        # Flatten both meshes and pair them up into unit-radius polar positions.
        flat_phi = self.phi_mesh.reshape(-1)
        flat_nu = self.nu_mesh.reshape(-1)
        directions = []
        for phi, nu in zip(flat_phi, flat_nu):
            directions.append(Position(phi, nu, 1., 'polar'))

        # Setup decoder
        fmt = AmbiFormat(ambi_order)
        self.decoder = AmbiDecoder(directions, fmt, method='projection')
Example #9
0
def ambix_power_map(ambix, audio_rate=22050, outp_rate=10, angular_res=5.0):
    """Compute windowed RMS maps of an ambisonic signal decoded over a spherical mesh.

    Args:
        ambix: Array whose first axis has ``(order + 1) ** 2`` channels and whose
            second axis is time in samples.
        audio_rate: Sample rate of ``ambix``.
        outp_rate: Number of output maps per second; the window is
            ``int(audio_rate / outp_rate)`` samples.
        angular_res: Resolution passed to ``spherical_mesh``.

    Returns:
        Stack (along axis 0) of one vertically flipped RMS map per window, each
        shaped like the mesh returned by ``spherical_mesh``.
    """
    from utils.ambisonics.distance import spherical_mesh
    from utils.ambisonics.decoder import AmbiDecoder, AmbiFormat
    from utils.ambisonics.position import Position

    phi_mesh, nu_mesh = spherical_mesh(angular_res)
    directions = []
    for phi, nu in zip(phi_mesh.reshape(-1), nu_mesh.reshape(-1)):
        directions.append(Position(phi, nu, 1., 'polar'))

    order = int(math.sqrt(ambix.shape[0]) - 1)
    fmt = AmbiFormat(ambi_order=order, sample_rate=audio_rate)
    decoder = AmbiDecoder(directions, fmt, method='projection')

    hop = int(audio_rate / outp_rate)
    map_shape = phi_mesh.shape

    def window_rms(start):
        # Compute RMS at each speaker for the window beginning at `start`.
        decoded = decoder.decode(ambix[:, start:start + hop])
        per_direction = np.sqrt(np.mean(decoded ** 2, 1))
        return np.flipud(per_direction.reshape(map_shape))

    starts = np.arange(0, ambix.shape[1], hop)
    return np.stack([window_rms(int(t)) for t in starts], 0)
    def __call__(self, frames, sig):
        """Crop the frames around a random center and decode the audio toward it.

        The center is drawn uniformly from [0, 1] for its first coordinate and
        from [0.33, 0.66] for its second.

        Args:
            frames: Iterable of images; each is converted with ``np.asarray``.
            sig: Ambisonic signal passed to ``decode_ambix``.

        Returns:
            Tuple ``(video, audio)``: the cropped frames as PIL images and the
            decoded float32 audio.
        """
        from utils.ambisonics.decoder import decode_ambix
        from utils.ambisonics.position import Position
        from utils.video360 import er2polar

        # Crop frames
        first = random.uniform(0., 1.)
        second = random.uniform(0.33, 0.66)
        center_point = np.array([first, second])
        self.nfov.setCenterPoit(center_point)

        video = []
        for img in frames:
            cropped = self.nfov.toNFOV(np.asarray(img))
            video.append(Image.fromarray(cropped))

        # Decode ambisonics into center point direction
        direction = Position(*er2polar(*center_point), 'polar')
        audio = decode_ambix(sig, direction).astype(np.float32)

        return video, audio
Example #11
0
    def __init__(self, data, rate=22050, window=0.1, angular_res=2.0):
        """Prepare a projection decoder over a spherical mesh for windowed analysis of ``data``.

        Args:
            data: Ambisonic signal; ``data.shape[0]`` is taken as the channel count
                ``(order + 1) ** 2`` and ``data.shape[1]`` as the number of samples.
            rate: Sample rate passed to ``AmbiFormat`` and used to convert
                ``window`` to samples.
            window: Length of each analysis chunk in seconds.
            angular_res: Resolution passed to ``spherical_mesh``.
        """
        self.window = window
        self.angular_res = angular_res
        self.data = data
        self.phi_mesh, self.nu_mesh = spherical_mesh(angular_res)
        mesh_p = [
            Position(phi, nu, 1., 'polar') for phi, nu in zip(
                self.phi_mesh.reshape(-1), self.nu_mesh.reshape(-1))
        ]

        # Setup decoder
        # The order must be an integer; np.sqrt returns a float.
        ambi_order = int(np.sqrt(data.shape[0]) - 1)
        self.decoder = AmbiDecoder(mesh_p,
                                   AmbiFormat(ambi_order=ambi_order,
                                              sample_rate=rate),
                                   method='projection')

        # Compute spherical energy averaged over consecutive chunks of "window" secs
        self.window_frames = int(self.window * rate)
        # Floor division: count only complete windows, and keep the count an
        # integer on Python 3 where "/" would produce a fractional float.
        self.n_frames = data.shape[1] // self.window_frames
        self.output_rate = float(rate) / self.window_frames
        self.frame_dims = self.phi_mesh.shape
        self.cur_frame = -1
Example #12
0
    def __init__(self, pos):
        """Build a projection decoder for the given list of ``(x, y)`` points.

        Args:
            pos: Iterable of ``(x, y)`` pairs; each pair is converted with
                ``er2polar`` and wrapped in a polar ``Position``.
        """
        from utils.ambisonics.decoder import AmbiDecoder
        from utils.ambisonics.position import Position

        directions = []
        for x, y in pos:
            directions.append(Position(*er2polar(x, y), 'polar'))
        self.decoder = AmbiDecoder(directions, method='projection')
Example #13
0
    pos1_stft_smooth = ndimage.gaussian_filter(np.abs(pos1_stft), sigma=sigma)

    pos2_stft = librosa.core.stft(project_audio(ambix, center=pos2)[0])
    pos2_stft_smooth = ndimage.gaussian_filter(np.abs(pos2_stft), sigma=sigma)

    mask = (pos1_stft_smooth > pos2_stft_smooth).astype(float)
    pos1_wav = librosa.core.istft(pos1_stft * mask)
    pos2_wav = librosa.core.istft(pos2_stft * (1. - mask))
    return pos1_wav, pos2_wav


if __name__ == '__main__':
    # Demo script: rotate the sound field and the video of one clip by each
    # (yaw, pitch) pair, write both to disk, then call overlay_map on the results.
    from utils.ioutils.audio import AudioReader, AudioWriter
    from utils.ambisonics.position import Position
    from math import pi

    yid = 'bb5eETSspVI-213'
    # yid = 'l5M8AvP6rvs-518'

    rotations = [(yaw, pitch) for yaw in [pi/2.] for pitch in [pi/6.]]
    for rot_theta, rot_pitch in rotations:
        pos = Position(rot_theta, rot_pitch, 1., c_type='polar')

        # Output file names encode the rotation angles.
        audio_fn = f'rot-yaw{rot_theta:.3f}-pitch{rot_pitch:.3f}.m4a'
        video_fn = f'rot-yaw{rot_theta:.3f}-pitch{rot_pitch:.3f}.mp4'
        out_fn = f'out-yaw{rot_theta:.3f}-pitch{rot_pitch:.3f}.mp4'

        # Read the source audio, rotate it and write the result.
        reader = AudioReader(f'data/spatial-audio-db/audio/{yid}.m4a')
        ambix = reader.read(0., 5., 24000)
        ambix_rot = rotate_ambix(ambix, pos)[0]
        AudioWriter(audio_fn, 24000).write(ambix_rot)

        # Apply the same rotation to the video and combine the two outputs.
        rotate_video(f'data/spatial-audio-db/video/{yid}.mp4', video_fn, pos)
        overlay_map(audio_fn, video_fn, out_fn)
Example #14
0
 def __init__(self, radius=0.1):
     """Store ``radius`` and place two microphones at ``(0, +radius, 0)`` and ``(0, -radius, 0)``."""
     self.radius = radius
     # Both positions are cartesian and differ only in the sign of y.
     self.lmic_pos, self.rmic_pos = (
         Position(0, radius, 0, 'cartesian'),
         Position(0, -radius, 0, 'cartesian'),
     )