Example #1
    def save_main_beam(self, rs_x, rs_y, sp_x, sp_y, is_circular, mic_num, c_x, c_y, i_m_d, angle, room_frequency,
                       freques, room_temp=None, room_humidity=None, is_airAbsorption=None):
        # Create a rs_x by rs_y metres shoe box room
        room = pra.ShoeBox([rs_x, rs_y], fs=room_frequency, temperature=room_temp, humidity=room_humidity,
                           air_absorption=is_airAbsorption)

        # Add a source somewhere in the room
        room.add_source([sp_x, sp_y])

        # Create a circular or linear beamforming array with mic_num microphones,
        # rotation `angle`, and inter-mic distance (radius in the circular case) i_m_d
        if is_circular:
            R = pra.circular_2D_array([c_x, c_y], mic_num, angle, i_m_d)
        else:
            R = pra.linear_2D_array([c_x, c_y], mic_num, angle, i_m_d)

        room.add_microphone_array(pra.Beamformer(R, room.fs))

        # Now compute the delay and sum weights for the beamformer
        room.mic_array.rake_delay_and_sum_weights(room.sources[0][:1])

        # plot the room and resulting beamformer
        room.plot(freq=freques, img_order=0)
        # plt.show()
        plt.savefig('./fig.png')
Example #2
def shoebox_rir(room_dim, source, mic):

    # Some simulation parameters
    Fs = 8000
    t0 = 1./(Fs*np.pi*1e-2)  # starting time function of sinc decay in RIR response
    absorption = 0.90
    max_order_sim = 10

    # create a microphone array
    R = pra.linear_2D_array(mic, 1, 0, 1)
    mics = pra.Beamformer(R, Fs)

    # create the room with sources and mics
    room1 = pra.ShoeBox(
        room_dim,
        fs=Fs,
        t0=t0,
        max_order=max_order_sim,
        absorption=absorption,
        sigma2_awgn=0)

    # add source and interferer
    room1.add_source(source)
    room1.add_microphone_array(mics)

    room1.compute_rir()
    h = room1.rir[0][0]

    return h
Example #3
def DAB_generate(source_audio, out_folder, name):

    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    # number of microphones
    M = 4

    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    distances = np.random.randint(1, 20, M)

    mic_pos = []
    for m in range(M):
        mic_distance = distances[m]
        mic_m = guess_microphone(
            source_position, mic_distance
        )  # random way: guess microphone position until it's in the room: very long time for small rooms
        mic_pos.append(mic_m)

    out_mic_file = os.path.join(out_folder, 'log_%s.txt' % name)

    # 'w' mode truncates any existing log file; the context manager closes it
    with open(out_mic_file, 'w') as f1:
        for l in range(M):
            f1.write("%s, %f\n" % (str(mic_pos[l]), distances[l]))

    Lg_t = 0.100  # filter size in seconds
    Lg = int(np.ceil(Lg_t * fs))  # in samples
    fft_len = 512
    mics = pra.Beamformer(np.asarray(mic_pos).T, shoebox.fs, N=fft_len, Lg=Lg)

    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)

    shoebox.compute_rir()
    shoebox.simulate()

    # ADDING NOISE AND SAVING

    for n in range(M):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)
        mixed_signal = add_noise(source_audio, signal)
        mixed_signal = np.array(mixed_signal, dtype=np.int16)
        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)
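Note: `guess_microphone` is not defined in this snippet. A minimal sketch consistent with the comment above (draw a random direction at the requested distance and retry until the candidate lands inside the room; `room_dimensions` is assumed to be a module-level global, as elsewhere in this example, and Example #15 below inlines the same logic):

import math
import random
import numpy as np

def guess_microphone(source_position, mic_distance):
    # draw a random direction, step mic_distance away from the source,
    # and retry until the candidate position falls inside the room
    while True:
        theta = random.uniform(0, 2 * math.pi)
        pos = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        if 0 <= pos[0] <= room_dimensions[0] and 0 <= pos[1] <= room_dimensions[1]:
            return pos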
Example #4
def beamformed_das(comb, people_num, sr=16000):
    f1 = comb[0]
    f2 = comb[1]
    # def beamformed_das(f1, f2, people_num, sr=16000):
    f1_data = f1['data']
    f2_data = f2['data']
    signal_len = len(f1['data'])
    distance = 1.5

    # azimuth = np.array([math.atan2(1.5, 0.5), math.atan2(1.5, -0.5)])
    azimuth = np.array([
        90.,
        270.,
    ]) * np.pi / 180

    # centre = [2, 1.5]
    room_dim = np.r_[4, 6]
    room = pra.ShoeBox(room_dim, fs=sr)
    echo = pra.linear_2D_array(center=(room_dim / 2), M=5, phi=0, d=0.5)
    echo = np.concatenate((echo, np.array((room_dim / 2), ndmin=2).T), axis=1)
    mics = pra.Beamformer(echo, room.fs)
    room.add_microphone_array(mics)

    # room.add_source(np.array([1.5, 4.5]), delay=0., signal=f1_data)
    # room.add_source(np.array([2.5, 4.5]), delay=0., signal=f2_data[:len(f1_data)])
    signals = [f1_data, f2_data]
    for i, ang in enumerate(azimuth):
        source_location = room_dim / 2 + distance * np.r_[np.cos(ang),
                                                          np.sin(ang)]
        source_signal = signals[i]
        room.add_source(source_location,
                        signal=source_signal[:signal_len],
                        delay=0)

    mics.rake_delay_and_sum_weights(room.sources[0][:1])

    # room.plot(freq=[300, 400, 500, 1000, 2000, 4000], img_order=0)
    # plt.show()
    # ax.legend(['300', '400', '500', '1000', '2000', '4000'])
    # fig.set_size_inches(20, 8)

    room.compute_rir()
    room.simulate()

    filename = 'beamformeded_%05d-%05d' % (f1['filename'],
                                           f2['filename']) + '.wav'

    with open(TXT_PATH + 'build_beamformeded.txt', 'a') as f:
        f.write(filename)
        f.write('\n')

    for i in range(5):
        wavfile.write(MICS_PATH + 'mic%d/' % (i + 1) + filename, sr,
                      room.mic_array.signals[i, :])
Example #5
def mic_rever_generator(room_size, target_location, target, fs,
                        microphone_array, amplifier, absorption_value):
    '''
    This function implements a single-source microphone-array reverberant speech generator.

    Usage:  mic_rever_generator(room_size, target_location, target, fs, microphone_array, amplifier, absorption_value)

        room_size                  - the size of the room [length, width, height]
        target_location            - the location of the target speech [x, y, z]
        target                     - the samples of the target speech file
        fs                         - sampling frequency
        microphone_array           - the locations of the microphones
        amplifier                  - the gain of the microphone's built-in amplifier
        absorption_value           - absorption value of the room walls

    Example call:
        clean_rever = mic_rever_generator(room_size, target_location, target, fs, microphone_array, amplifier, absorption_value)

    References:
    microphone array speech generator release 0.1

    Author: Rui Cheng
    '''

    # create the room
    room = pra.ShoeBox(room_size,
                       fs=fs,
                       absorption=absorption_value,
                       max_order=17)

    room.add_source(target_location, signal=target, delay=0)
    #room.add_source([3.5, 3.0, 1.76], signal=interf[:len(target)], delay=0)

    # add microphone array
    R = microphone_array
    fft_len = 512
    Lg_t = 0.100
    Lg = int(np.ceil(Lg_t * room.fs))
    mic_array = pra.Beamformer(R, room.fs, N=fft_len, Lg=Lg)
    room.add_microphone_array(mic_array)

    # create the room impulse response
    # compute image sources
    room.image_source_model(use_libroom=True)

    # microphone speech
    room.simulate()

    # clean speech in each channel
    clean_rever = amplifier * room.mic_array.signals.astype("int16")

    # return
    return clean_rever
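A minimal sketch of a call (the geometry and gain are illustrative, and 'speech.wav' is a hypothetical mono file):

import numpy as np
from scipy.io import wavfile

fs, target = wavfile.read('speech.wav')  # hypothetical mono speech file

room_size = [4.5, 6.5, 3.0]         # length, width, height in metres
target_location = [1.0, 2.0, 1.7]   # position of the speech source

# a 3-microphone line array at 1.5 m height (3 x M position matrix)
microphone_array = np.c_[[2.0, 3.0, 1.5],
                         [2.1, 3.0, 1.5],
                         [2.2, 3.0, 1.5]]

clean_rever = mic_rever_generator(room_size, target_location, target, fs,
                                  microphone_array, amplifier=1,
                                  absorption_value=0.35)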
Example #6
def Beamformer_Distortionless(mixed: np.ndarray, state: dict, options: dict):
    # Get options
    nSources = options['nSources']
    stft_size = options['stft_size'] if 'stft_size' in options else 1024
    delay = options['delay'] if 'delay' in options else 0.05
    nPaths = options['nPaths'] if 'nPaths' in options else 1
    FD = options['FD'] if 'FD' in options else False

    if 'room_object' in options:
        room = options['room_object']
    else:
        warnings.warn(
            'room_object is required in algorithm options for beamforming')
        return np.zeros((nSources, mixed.shape[1])), state

    fs = room.fs

    # Check number of sources to be equal to 2
    if nSources != 2:
        warnings.warn(
            'Perceptual beamformer is implemented only for 2 sources '
            '({} were requested)'.format(nSources))
        return np.zeros((nSources, mixed.shape[1])), state

    # Create beamformer object
    bmfr = pra.Beamformer(room.mic_array.R, fs, N=stft_size)

    # "Record" mixed data with beamformer
    bmfr.record(mixed, fs)

    # Create filters that point to source 1
    bmfr.rake_distortionless_filters(room.sources[0][0:nPaths],
                                     room.sources[1][0:nPaths],
                                     room.sigma2_awgn *
                                     np.eye(bmfr.Lg * bmfr.M),
                                     delay=delay)
    s1 = bmfr.process(FD)

    # Create filters that point to source 2
    bmfr.rake_distortionless_filters(room.sources[1][0:nPaths],
                                     room.sources[0][0:nPaths],
                                     room.sigma2_awgn *
                                     np.eye(bmfr.Lg * bmfr.M),
                                     delay=delay)
    s2 = bmfr.process(FD)

    return np.stack([s1, s2], axis=0), state
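A sketch of one possible invocation, assuming `room` is a pyroomacoustics room that was simulated with two sources, a microphone array, and `sigma2_awgn` set, and that `mixed` holds the multichannel mixture (channels x samples); the option values are illustrative:

state = {}
options = {
    'nSources': 2,
    'stft_size': 1024,
    'delay': 0.05,
    'nPaths': 1,
    'FD': False,
    'room_object': room,  # simulated pra.Room; room.sigma2_awgn must be set
}

separated, state = Beamformer_Distortionless(mixed, state, options)
s1_est, s2_est = separated  # one estimate per source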
Example #7
0
 def __init__(self):
     super(Room, self).__init__()
     # Create a ~4 by ~7 metres shoe box room
     rx = random.uniform(3.8, 4.2)
     ry = random.uniform(6.8, 7.2)
     self.room = pra.ShoeBox([rx, ry], fs=16000)
     # Create 2 microphones, 20 cm apart
     self.x = random.uniform(0.5, 3.5)
     self.my = random.uniform(0.5, 1.5)
     R = np.c_[[self.x - 0.1, self.my],  # mic 1
               [self.x + 0.1, self.my],  # mic 2
               ]
     self.room.add_microphone_array(pra.Beamformer(R, self.room.fs))
     self.delay = 0
     self.rate = 16000
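Usage is not shown in this snippet; a minimal sketch, assuming the parent class needs no constructor arguments, could be:

import numpy as np

env = Room()
# place a source and run the image source simulation
env.room.add_source([2.0, 5.0], signal=np.random.randn(env.rate))
env.room.simulate()
stereo = env.room.mic_array.signals  # shape: (2, n_samples)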
Example #8
Lg_t = 0.100  # filter size in seconds
Lg = int(np.ceil(Lg_t * Fs))  # filter size in samples
alphas = np.arange(0.1, 1, 0.05)
source = np.array([1, 4.5])
interferer = np.array([3.5, 3.])
radius = 0.15

roomDim = [8, 6]
center = [1, 3.5]
fft_len = 512
echo = pra.circular_2D_array(center=center, M=6, phi0=0, radius=radius)
echo = np.concatenate((echo, np.array(center, ndmin=2).T), axis=1)

for alpha in alphas:
    room_bf = pra.ShoeBox(roomDim, fs=Fs, max_order=64, absorption=alpha)
    mics = pra.Beamformer(echo, room_bf.fs, N=fft_len, Lg=Lg)
    room_bf.add_microphone_array(mics)
    room_bf.add_source(source, delay=0., signal=xtone)
    room_bf.add_source(interferer, delay=0, signal=silence)

    # Compute DAS weights
    mics.rake_delay_and_sum_weights(room_bf.sources[0][:1])

    #
    # Do Beamforming
    room_bf.image_source_model(use_libroom=True)
    room_bf.compute_rir()
    room_bf.simulate()

    #
    signal_das = mics.process(FD=False)
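This fragment assumes `Fs`, `xtone`, and `silence` are defined earlier; a minimal preamble that makes it self-contained (the values are assumptions) could be:

import numpy as np
import pyroomacoustics as pra

Fs = 16000                           # sampling frequency
t = np.arange(2 * Fs) / Fs
xtone = np.sin(2 * np.pi * 440 * t)  # 2 s test tone at 440 Hz
silence = np.zeros_like(xtone)       # silent interferer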
Example #9
"""
This example shows how to create delay and sum beamformers
"""
from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
import pyroomacoustics as pra

# Create a 4 by 6 metres shoe box room
room = pra.ShoeBox([4, 6])

# Add a source somewhere in the room
room.add_source([2.5, 4.5])

# Create a linear array beamformer with 4 microphones
# with angle 0 degrees and inter mic distance 4 cm
R = pra.linear_2D_array([2, 1.5], 4, 0, 0.04)
room.add_microphone_array(pra.Beamformer(R, room.fs))

# Now compute the delay and sum weights for the beamformer
room.mic_array.rake_delay_and_sum_weights(room.sources[0][:1])

# plot the room and resulting beamformer
room.plot(freq=[1000, 2000, 4000, 8000], img_order=0)
plt.show()
Example #10
def process_experiment_max_sinr(SIR, mic, args):

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read_in the mix signals
    fs_led, leds = wavfile.read(
        file_pattern.format('camera_leds_zero_hold', 'mix', SIR))
    fs_snd, audio = wavfile.read(
        file_pattern.format(mic_choices[mic], 'mix', SIR))
    assert fs_led == fs_snd

    # read in the ref signals
    r, noise_ref = wavfile.read(
        file_pattern.format(mic_choices[mic], 'noise_ref', SIR))
    assert r == fs_snd
    r, speech_ref = wavfile.read(file_speech_ref.format(mic_choices[mic]))
    assert r == fs_snd
    r, leds_ref = wavfile.read(file_speech_ref.format('camera_leds_zero_hold'))
    assert r == fs_snd

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = noise_ref + speech_ref

    # get the geometry information to get nice plots.
    mics_loc = np.array(protocol['geometry']['microphones'][mic]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if mic == 'pyramic':
        I = list(range(8, 16)) + list(range(24, 32)) + list(range(
            40, 48))  # flat part
        #I = list(range(24,32)) + list(range(40,48)) # flat part
        #I = list(range(8,16))
        #I = list(range(48))
        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        speech_ref = speech_ref[:, I].copy()
        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'olympus':
        mics_positions = mics_geom['olympus'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # perform VAD
    vad_snd = leds > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[max(i - vad_guard, 0):i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################

    print('STFT and stuff')
    sys.stdout.flush()

    engine = pra.realtime.STFT(nfft,
                               nfft // 2,
                               pra.hann(nfft),
                               channels=audio.shape[1])

    def analysis(x):
        engine.analysis(x)
        return np.moveaxis(engine.X, 1, 0)

    # Now compute the STFT of the microphone input
    X = analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = analysis(audio * vad_guarded[:, None])
    X_noise = analysis(audio * (1 - vad_guarded[:, None]))

    S_ref = analysis(speech_ref)
    N_ref = analysis(noise_ref)

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################

    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))

    # compute covariances with reference signals to check everything is working correctly
    #Rs = np.einsum('i...j,i...k->...jk', S_ref, np.conj(S_ref))
    #Rn = np.einsum('i...j,i...k->...jk', N_ref, np.conj(N_ref))

    # compute the MaxSINR beamformer
    w = [
        la.eigh(rs, b=rn, subset_by_index=[n_channels - 1, n_channels - 1])[1]
        for rs, rn in zip(Rs[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w,
                         X_speech,
                         X_speech[:, :, 0],
                         clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########

    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T,
                               fs=fs_snd,
                               N=nfft,
                               hop=nfft,
                               zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    ref = np.vstack([speech_ref[:, 0], noise_ref[:, 0]])
    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(
        int(pra.tdoa(out, speech_ref[:, 0].astype(float), phat=True)))
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    else:
        out_trunc = np.concatenate(
            (np.zeros(-delay), out[:ref.shape[1] + delay]))
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    sig_eval = np.vstack([out_trunc, noise_eval])

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, None], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    SDR_out = metric[0][0]
    SIR_out = metric[2][0]

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        upper = np.maximum(audio[:, 0].max(), out.max())
        sig_in = pra.highpass(audio[:, 0].astype(float) / upper,
                              fs_snd,
                              fc=150)
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)

        f1 = os.path.join(args.save_sample,
                          '{}_ch0_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_in)
        f2 = os.path.join(args.save_sample,
                          '{}_out_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(speech_ref[:, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(leds.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, leds, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(leds)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, leds * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'LED', 'VAD', 'VAD guarded'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        #plt.plot(a_time, speech_ref[:,0])
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines(
            [180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)], 0,
            nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2],
                           fs=16000,
                           max_order=1)

        room.add_source(noise_loc[:2])  # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(
            protocol['geometry']['speakers']['locations'][1][:2])  # signal

        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return SDR_out, SIR_out
Example #11
import numpy as np
import pyroomacoustics as pra

room = pra.ShoeBox([4, 6], fs=16000, max_order=1)

# add sources in the room
room.add_source([2, 1.5])  # nice source
room.add_source([2, 4.5])  # interferer

# add a circular beamforming array
shape = pra.circular_2D_array([2.5, 3], 8, 0.0, 0.15)
bf = pra.Beamformer(shape, room.fs, Lg=500)
room.add_microphone_array(bf)

# run the ISM
room.image_source_model()

# the noise matrix, note that the size is the number of
# sensors multiplied by the filter size
Rn = np.eye(bf.M * bf.Lg) * 1e-5


def test_rake_max_udr_filters():
    # no interferer
    bf.rake_max_udr_filters(room.sources[0][:4],
                            R_n=Rn,
                            delay=0.015,
                            epsilon=1e-2)
    # with interferer
    bf.rake_max_udr_filters(
        room.sources[0][:4],
        interferer=room.sources[1][:4],
        R_n=Rn,
        delay=0.015,
        epsilon=1e-2)
Example #12
def mic_clean_generator(room_size, target_location, target, fs,
                        microphone_array, amplifier):
    '''
    This function implements a single-source microphone-array clean speech generator.

    Usage:  mic_clean_generator(room_size, target_location, target, fs, microphone_array, amplifier)

        room_size                  - the size of the room [length, width, height]
        target_location            - the location of the target speech [x, y, z]
        target                     - the samples of the target speech file
        fs                         - sampling frequency
        microphone_array           - the locations of the microphones
        amplifier                  - the gain of the microphone's built-in amplifier

    Example call:
        clean = mic_clean_generator(room_size, target_location, target, fs, microphone_array, amplifier)

    References:
    microphone array speech generator release 0.1

    Author: Rui Cheng
    '''

    # create the room
    room = pra.ShoeBox(room_size, fs=fs, absorption=1.0, max_order=17)
    '''fig, ax = room.plot()
    ax.set_xlim([0, 4.5])
    ax.set_ylim([0, 6.5])
    ax.set_zlim([0, 4])
    plt.show()
    '''
    # add source
    room.add_source(target_location, signal=target, delay=0)
    #room.add_source([3.5, 3.0, 1.76], signal=interf[:len(target)], delay=0)    # for multi-source
    '''fig, ax = room.plot()
    ax.set_xlim([0, 4.5])
    ax.set_ylim([0, 6.5])
    ax.set_zlim([0, 4])
    plt.show()'''

    # add microphone array
    R = microphone_array
    fft_len = 512
    Lg_t = 0.100
    Lg = int(np.ceil(Lg_t * room.fs))
    mic_array = pra.Beamformer(R, room.fs, N=fft_len, Lg=Lg)
    room.add_microphone_array(mic_array)
    '''fig, ax = room.plot()
    ax.set_xlim([0, 4.5])
    ax.set_ylim([0, 6.5])
    ax.set_zlim([0, 4])
    plt.show()'''

    # create the room impulse response
    # compute image sources
    room.image_source_model(use_libroom=True)
    # visualize 3D polyhedron room and image sources
    '''fig, ax = room.plot(img_order=3)
    fig.set_size_inches(20, 10)
    plt.show()'''
    '''room.plot_rir()
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()'''

    # microphone speech
    room.simulate()

    # clean speech in each channel
    clean = amplifier * room.mic_array.signals.astype("int16")

    return clean
Example #13
def modify_input_wav_beamforming(wav, noise, room_dim, max_order, snr_vals,
                                 mic_array, pos_source, pos_noise, N):

    fs_s, audio_anechoic = wavfile.read(wav)
    fs_n, noise_anechoic = wavfile.read(noise)

    #Create a room for the signal
    room_signal = pra.ShoeBox(room_dim,
                              absorption=0.2,
                              fs=fs_s,
                              max_order=max_order)

    #Create a room for the noise
    room_noise = pra.ShoeBox(room_dim,
                             absorption=0.2,
                             fs=fs_n,
                             max_order=max_order)

    #sources of the signal and of the noise in their respective rooms
    room_signal.add_source(pos_source, signal=audio_anechoic)
    room_noise.add_source(pos_noise, signal=noise_anechoic)

    #add the microphone array
    mics_signal = pra.Beamformer(mic_array, room_signal.fs, N)
    mics_noisy = pra.Beamformer(mic_array, room_noise.fs, N)
    room_signal.add_microphone_array(mics_signal)
    room_noise.add_microphone_array(mics_noisy)

    #simulate both rooms
    room_signal.simulate()
    room_noise.simulate()

    #take the mic_array.signals from each room
    audio_reverb = room_signal.mic_array.signals
    noise_reverb = room_noise.mic_array.signals

    #design beamforming filters
    mics_signal.rake_delay_and_sum_weights(room_signal.sources[0][:1])
    mics_noisy.rake_delay_and_sum_weights(room_signal.sources[0][:1])

    output_signal = mics_signal.process()
    output_noise = mics_noisy.process()

    #check that the noise signal is at least as long as the audio signal
    if len(noise_reverb[0]) < len(audio_reverb[0]):
        raise ValueError(
            'the noise signal is shorter than the audio signal!')
    output_noise = output_noise[:len(output_signal)]

    #normalize the noise relative to the last microphone channel
    norm_fact = np.linalg.norm(noise_reverb[-1])
    noise_normalized = output_noise / norm_fact

    #initialize the array of noisy signals
    noisy_signal = np.zeros([len(snr_vals), np.shape(output_signal)[0]])

    for i, snr in enumerate(snr_vals):
        noise_std = np.linalg.norm(audio_reverb[-1]) / (10**(snr / 20.))
        final_noise = noise_normalized * noise_std
        noisy_signal[i] = pra.normalize(
            pra.highpass(output_signal + final_noise, fs_s))

    return noisy_signal
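A sketch of one possible call, with illustrative geometry and hypothetical file names ('speech.wav', 'noise.wav'):

import numpy as np
import pyroomacoustics as pra

room_dim = [5., 4.]
mic_array = pra.circular_2D_array(center=[2.5, 2.0], M=6, phi0=0, radius=0.1)
snr_vals = np.arange(0, 30, 5)  # SNR values to synthesize, in dB

noisy = modify_input_wav_beamforming(
    'speech.wav', 'noise.wav', room_dim, max_order=10,
    snr_vals=snr_vals, mic_array=mic_array,
    pos_source=[1.0, 3.0], pos_noise=[4.0, 1.0], N=1024)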
Example #14
    #create the room
    room = pra.ShoeBox(
        room_dim,
        absorption=0.2,
        fs=fs_s,
        t0=t0,
        max_order=max_order,
        sigma2_awgn=5e-7)

    #add the sources
    room.add_source(pos_source, signal=audio_anechoic, delay=0.)
    room.add_source(pos_noise, signal=noise_anechoic, delay=1.0)

    #add the microphone array and compute RIR
    mics = pra.Beamformer(R, room.fs, N, Lg=Lg)
    room.add_microphone_array(mics)
    room.compute_rir()
    room.simulate()

    #design the beamforming filters using some of the image sources
    good_sources = room.sources[0][:max_order_design + 1]
    bad_sources = room.sources[1][:max_order_design + 1]
    mics.rake_mvdr_filters(good_sources, bad_sources,
                           5e-7 * np.eye(mics.Lg * mics.M), delay=delay)

    #process the signal
    noisy_signal_beamforming = mics.process()
    out_RakeMVDR = pra.highpass(noisy_signal_beamforming, room.fs).astype(np.int16)
    dest = os.path.join(dest_dir, "beamforming_signal.wav")
    wavfile.write(dest, 16000, out_RakeMVDR)
    score_beamformer = label_wav(dest, labels_file, graph_file,
                                 speech.meta.as_dict()['word'])
Example #15
def DB_generate(source_audio, out_folder, name):

    #source_audio = pra.normalize(source_audio, bits=16)

    mic_distance = random.randint(
        1, 20)  # mean distance from source to microphones
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    # random way: guess the array center until it's inside the room
    # (can take very long for small rooms)
    mic_in_room = False
    while not mic_in_room:
        theta = random.uniform(0, 2 * math.pi)
        mic_center = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        print(mic_center)
        if (0 <= mic_center[0] <= room_dimensions[0]) and (
                0 <= mic_center[1] <= room_dimensions[1]):
            mic_in_room = True

    # number of lateral microphones
    M = 4
    # counterclockwise rotation of array:
    phi = 0
    # distance between microphones
    d = 0.4

    mic_pos = pra.beamforming.linear_2D_array(mic_center, M, phi, d)
    mic_pos = np.concatenate((mic_pos, np.array(mic_center, ndmin=2).T),
                             axis=1)

    distances = []
    for m in range(M):
        d = math.sqrt((source_position[0] - mic_pos[0, m])**2 +
                      (source_position[1] - mic_pos[1, m])**2)
        distances.append(d)

    # create room
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    # shoebox.mic_array.to_wav(os.path.join(out_folder + '_DB', 'mix_' + name), norm=True, bitdepth=np.int16)

    Lg_t = 0.100  # filter size in seconds
    Lg = int(np.ceil(Lg_t * fs))  # in samples
    fft_len = 512

    mics = pra.Beamformer(mic_pos, shoebox.fs, N=fft_len, Lg=Lg)

    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)
    shoebox.compute_rir()
    shoebox.simulate()

    # ADDING NOISE

    for n in range(M + 1):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)

        mixed_signal = add_noise(source_audio, signal)

        mixed_signal = np.array(mixed_signal, dtype=np.int16)

        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)
Example #16
noise_loc = np.r_[2.5, 4.5]

SIR = 25  # decibels
SINR = SIR - 1  # decibels

sigma_i, sigma_n = compute_variances(SIR, SINR, src_loc, noise_loc, mics_loc.mean(axis=1), sigma_s=sigma_s)

interference_audio = np.random.randn(target_audio.shape[0] + fs_sound) * sigma_i

room = pra.ShoeBox([6,5], fs=16000, max_order=12, absorption=0.4, sigma2_awgn=sigma_n**2)
room.add_source(src_loc, signal=target_audio)
room.add_source(noise_loc, signal=interference_audio)

# conventional microphone array
M = mics_loc.shape[1]
mics = pra.Beamformer(mics_loc, fs=fs_sound, N=nfft, hop=nfft // 2, zpb=nfft)
room.add_microphone_array(mics)

room.simulate()

# sound-to-light sensor
# we assume there is no propagation delay between speaker and sensor
leds = LightArray2(src_loc, fs=fs_light)
leds.record(target_audio + np.random.randn(*target_audio.shape) * sigma_n, fs=fs_sound)
leds_sig = leds.signals - leds.signals.min()
leds_sig /= leds_sig.max()
leds_time = np.arange(leds.signals.shape[0]) / fs_light

# perform VAD on the light signal
vad = leds.signals > vad_thresh
Example #17
max_order_design = 1  # maximum image generation used in design
shape = 'Circular'  # array shape

# TD filter length
Lg_t = 0.05  # Filter size in seconds
Lg = int(np.ceil(Lg_t * Fs))
Lgp = int(np.floor(0.4 * Lg))
Lgm = Lg - Lgp
print('Lg=', Lg)

# create a microphone array
if shape == 'Circular':
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear_2D_array(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N, Lg=Lg, hop=hop, zpf=zp, zpb=zp)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.
Example #18
def process_experiment_max_sinr(SIR, mic, blinky, args):

    session = args.session
    target = args.target

    with open(metadata_file.format(session=args.session), 'r') as f:
        metadata = json.load(f)

    file_pattern = os.path.join(experiment_folder, metadata['filename_pattern'])

    with open(protocol_file.format(session=args.session), 'r') as f:
        protocol = json.load(f)

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read_in the mix signals
    fs_led, leds   = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=blinky, source='mix', fs=fs))
    fs_snd, audio  = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=mic_choices[mic], source='mix', fs=fs))
    assert fs_led == fs_snd

    # read in the ref signals
    sources_ref  = dict(zip(target_choices,
        [ wavfile.read(file_pattern.format(
                session=session, mic=mic_choices[mic], snr=SIR, source=ch, fs=fs))[1]
            for ch in target_choices ]))
    leds_ref  = dict(zip(target_choices,
        [ wavfile.read(file_pattern.format(
                session=session, mic=blinky, snr=SIR, source=ch, fs=fs))[1]
            for ch in target_choices ]))

    # reorder with target in first position
    ref = np.array([sources_ref[target]] + [sources_ref[ch]
                for ch in target_choices if ch != target])

    noise_ref = np.zeros_like(sources_ref[target])
    n_ch = [ch for ch in target_choices if ch != target]
    for ch in n_ch:
        noise_ref += sources_ref[ch]

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = sources_ref[target] + noise_ref

    # get the geometry information to get nice plots.
    mics_geom = {
            'pyramic' : np.array(protocol['geometry']['microphones']['pyramic']['locations']),
            'camera'  : np.array(protocol['geometry']['microphones']['camera']['locations']),
            }

    mics_loc = np.array(protocol['geometry']['microphones'][mic_choices[mic]]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if 'pyramic' in mic:

        if mic == 'pyramic_2':
            I = pyramic_bss_2ch
        elif mic == 'pyramic_4':
            I = pyramic_bss_4ch
        elif mic == 'pyramic_24':
            I = list(range(8,16)) + list(range(24,32)) + list(range(40,48)) # flat part
        elif mic == 'pyramic_48':
            I = list(range(48))
        else:
            raise ValueError('Unsupported configuration')

        audio = audio[:,I]
        noise_ref = noise_ref[:,I].copy()
        ref = ref[:,:,I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None,:]
        mics_positions[:,2] -= np.max(mics_positions[:,2])
        mics_positions += mics_loc

    elif mic == 'camera':
        mics_positions = mics_geom['camera'].copy() + mics_loc


    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # adjust length of led signal if necessary
    if leds.shape[0] < audio.shape[0]:
        z_missing = audio.shape[0] - leds.shape[0]
        leds = np.pad(leds, (0,z_missing), 'constant')
    elif leds.shape[0] > audio.shape[0]:
        leds = leds[:audio.shape[0],]

    # perform VAD
    led_target = leds[:,blinky_source_map[target]]
    vad_snd = led_target > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i,v in enumerate(vad_snd):
            if np.any(vad_snd[max(i - vad_guard, 0):i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################

    print('STFT and stuff')
    sys.stdout.flush()

    a_win = pra.hann(nfft)
    s_win = pra.realtime.compute_synthesis_window(a_win, nfft // 2)

    engine = pra.realtime.STFT(nfft, nfft // 2,
            analysis_window=a_win, synthesis_window=s_win,
            channels=audio.shape[1])

    # Now compute the STFT of the microphone input
    X = engine.analysis(audio)
    X_time = np.arange(1, X.shape[0]+1) * (nfft / 2) / fs_snd

    X_speech = engine.analysis(audio * vad_guarded[:audio.shape[0],None])
    X_noise = engine.analysis(audio * (1 - vad_guarded[:audio.shape[0],None]))

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################

    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise)) 
    Rall = Rs + Rn

    # compute the MaxSINR beamformer
    w = [la.eigh(rs, b=rn, subset_by_index=[n_channels - 1, n_channels - 1])[1] for rs, rn in zip(Rall[1:], Rn[1:])]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10,:] /= nw[nw > 1e-10,None]
    w = np.concatenate([np.ones((1,n_channels)), w], axis=0)  # add dummy beamformer at DC

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:,:,0], clip_up=args.clip_gain)
        w *= z[:,None]


    ###########
    ## APPLY ##
    ###########

    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:,:2].T, fs=fs_snd, N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = int(pra.tdoa(out, ref[0,:,0].astype(float), phat=True))
    print(delay)
    delay = np.abs(delay)
    if delay > 0:
        out_trunc = out[delay:delay+ref.shape[1]]
    else:
        out_trunc = np.concatenate((np.zeros(-delay), out[:ref.shape[1]+delay]))
    sig_eval = np.vstack([out_trunc] * len(target_choices))

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:,:,0,None], sig_eval[:,:,None])

    # we are only interested in SDR and SIR for the speech source
    ret = { 'Max-SINR' : {'SDR' : metric[0][0], 'SIR' : metric[2][0]} }


    #############################
    ## BLIND SOURCE SEPARATION ##
    #############################

    if mic in ['camera', 'pyramic_2', 'pyramic_4']:

        Y = pra.bss.auxiva(X, n_iter=40)
        bss = pra.realtime.synthesis(Y, nfft, nfft // 2, win=s_win)

        match = []
        for col in range(bss.shape[1]):
            xcorr = fast_corr(bss[:,col], ref[0,:,0])
            match.append(np.max(xcorr))
        best_col = np.argmax(match)

        # Not sure why the delay is sometimes negative here... Need to check more
        delay = np.abs(int(pra.tdoa(bss[:,best_col], ref[0,:,0].astype(float), phat=True)))
        if delay > 0:
            bss_trunc = bss[delay:delay+ref.shape[1],]
        elif delay < 0:
            bss_trunc = np.concatenate((np.zeros((-delay, bss.shape[1])), bss[:ref.shape[1]+delay]))
        else:
            bss_trunc = bss[:ref.shape[1],]

        if ref.shape[1] > bss_trunc.shape[0]:
            ref_lim = bss_trunc.shape[0]
        else:
            ref_lim = ref.shape[1]

        if mic in ['camera', 'pyramic_2']:
            bss_trunc = np.hstack([bss_trunc] * 2)

        metric = bss_eval_images(ref[:,:ref_lim,0,None], bss_trunc.T[:,:,None])
        SDR_bss = metric[0][0]
        SIR_bss = metric[2][0]
        ret['BSS'] = { 'SDR' : metric[0][0], 'SIR' : metric[2][0] }

    #################################
    ## Estimate SDR and SIR of mix ##
    #################################

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(int(pra.tdoa(audio[:,0], ref[0,:,0].astype(float), phat=True)))
    if delay > 0:
        audio_trunc = audio[delay:delay+ref.shape[1],0]
    elif delay < 0:
        audio_trunc = np.concatenate((np.zeros(-delay), audio[:ref.shape[1]+delay,0]))
    else:
        audio_trunc = audio[:ref.shape[1],0]

    if ref.shape[1] > audio_trunc.shape[0]:
        ref_lim = audio_trunc.shape[0]
    else:
        ref_lim = ref.shape[1]

    audio_trunc = np.vstack([audio_trunc] * len(ref))

    metric = bss_eval_images(ref[:,:ref_lim,0,None], audio_trunc[:,:,None])
    SDR_bss = metric[0][0]
    SIR_bss = metric[2][0]
    ret['Mix'] = { 'SDR' : metric[0][0], 'SIR' : metric[2][0] }

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        if not os.path.exists(args.save_sample):
            os.makedirs(args.save_sample)

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            upper = np.max([audio[:,0].max(), out.max(), bss.max(), ref[0,:,0].max()])
        else:
            upper = np.max([audio[:,0].max(), out.max(), ref[0,:,0].max()])


        # Clean signal for reference
        sig_ref = pra.highpass(ref[0,:,0].astype(float) / upper, fs_snd, fc=150)
        f0 = os.path.join(args.save_sample, '{}_ref_SIR_NA_dB.wav'.format(mic))
        wavfile.write(f0, fs_snd, sig_ref)

        # Mix signal for reference
        sig_mix = pra.highpass(audio[:,0].astype(float) / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample, '{}_mix_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_mix)

        # Output of MaxSINR
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f2 = os.path.join(args.save_sample, '{}_maxsinr_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

        # Output of BSS
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            sig_bss = pra.highpass(bss[:,best_col] / upper, fs_snd, fc=150)
            f3 = os.path.join(args.save_sample, '{}_bss_SIR_{}_dB.wav'.format(mic, SIR))
            wavfile.write(f3, fs_snd, sig_bss)


    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(ref[0,:,0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(led_target.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, led_target, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(led_target)

        plt.figure()
        plt.plot(audio_time, audio[:,0], 'b') 
        plt.plot(led_time, led_target * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'LED', 'VAD', 'VAD guarded'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:,0])
        plt.plot(a_time, out_trunc)
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        '''
        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180+np.degrees(theta_speech), 180-np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)

        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal

        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])
        '''

        plt.figure()
        mic_array.plot()

        plt.show()


    # Return SDR and SIR
    return ret
Example #19
shape = 'Linear'  # array shape
Lg_t = 0.050  # Filter size in seconds
Lg = int(np.ceil(Lg_t * Fs))  # Filter size in samples
delay = 0.03

# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
elif shape == 'Poisson':
    R = pra.poisson_2D_array(mic1, M, d)
else:
    R = pra.linear_2D_array(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.
Example #20
center = [1, 2]
fft_len = 512
echo = pra.circular_2D_array(center=center, M=6, phi0=0, radius=radius)
echo = np.concatenate((echo, np.array(center, ndmin=2).T), axis=1)
sigma2_n = 5e-7
max_order_design = 1

for alpha in alphas:
    corners = np.array([[0, 0], [0, 4], [6, 4], [6, 1], [2, 1],
                        [2, 0]]).T  # [x,y]
    roomPoly = pra.Room.from_corners(corners,
                                     fs=Fs,
                                     max_order=12,
                                     absorption=alpha)
    mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
    roomPoly.add_microphone_array(mics)
    roomPoly.add_source(source, delay=0, signal=xtone)
    roomPoly.add_source(interferer, delay=0, signal=silence)
    roomPoly.image_source_model(use_libroom=True)
    roomPoly.compute_rir()
    roomPoly.simulate()

    # Rake MVDR simulation
    BeamformerType = 'RakeMVDR'
    good_sources = roomPoly.sources[0][:max_order_design + 1]
    bad_sources = roomPoly.sources[1][:max_order_design + 1]
    mics.rake_mvdr_filters(good_sources, bad_sources,
                           sigma2_n * np.eye(mics.Lg * mics.M))
    output = mics.process()
    out = pra.normalize(pra.highpass(output, Fs))
Example #21
def perceptual_quality_evaluation(room_dim, mics, good_pos, good_index,
                                  bad_pos, bad_index, rir_location):
    print('start')

    import numpy as np
    from scipy.io import wavfile
    from os import getpid

    import pyroomacoustics as pra

    # number of sources to  consider
    n_sources = np.arange(1, 8)
    S = n_sources.shape[0]

    # number of mics
    n_mic = mics.shape[1]

    # Set the speed of sound to match that of the measured RIR
    pra.constants.set('c', 345.5)

    Fs = 8000.
    N = 1024
    Lg = int(0.03 * Fs)  # 30 ms long filter
    delay_bf = 0.02
    sigma2_n = 1e-6

    # reflection coefficients from the walls (hand-waving)
    reflection = {
        'ground': 0.8,
        'south': 0.8,
        'west': 0.8,
        'north': 0.8,
        'east': 0.8,
        'ceilling': 0.5
    }

    speech_sample1 = 'samples/fq_sample1_8000.wav'
    speech_sample2 = 'samples/fq_sample2_8000.wav'

    # Create the room
    room = pra.ShoeBox3D(np.zeros(3),
                         room_dim,
                         Fs,
                         max_order=1,
                         absorption=reflection,
                         sigma2_awgn=sigma2_n)

    # Create the beamformer
    bf = pra.Beamformer(mics, Fs, N=N, Lg=Lg)
    room.addMicrophoneArray(bf)

    # data receptacles
    beamformer_names = ['Rake Perceptual', 'Rake MVDR']
    bf_weights_fun = [bf.rakePerceptualFilters, bf.rakeMVDRFilters]
    bf_fnames = ['1', '2']
    NBF = len(beamformer_names)

    # receptacle arrays
    pesq_input = np.zeros(2)
    pesq_bf = np.zeros((2, NBF, S))

    # create a single reference mic at position of microphone 4
    ref_mic_n = 4
    ref_mic = pra.MicrophoneArray(bf.R[:, ref_mic_n, np.newaxis], Fs)

    # since we run multiple thread, we need to uniquely identify filenames
    pid = str(getpid())

    file_ref = 'output_samples/fqref' + pid + '.wav'
    file_suffix = '-' + pid + '.wav'
    files_bf = [
        'output_samples/fq' + str(i + 1) + file_suffix for i in range(NBF)
    ]
    file_raw = 'output_samples/fqraw' + pid + '.wav'

    # index of good and bad sources
    good = good_index
    bad = bad_index

    # Read the two speech samples used
    rate, good_signal = wavfile.read(speech_sample1)
    good_signal = np.array(good_signal, dtype='float64')
    good_signal = pra.normalize(good_signal)
    good_signal = pra.highpass(good_signal, rate)
    good_len = good_signal.shape[0] / float(Fs)

    rate, bad_signal = wavfile.read(speech_sample2)
    bad_signal = np.array(bad_signal, dtype='float64')
    bad_signal = pra.normalize(bad_signal)
    bad_signal = pra.highpass(bad_signal, rate)
    bad_len = bad_signal.shape[0] / float(Fs)

    # variance of good signal
    good_sigma2 = np.mean(good_signal**2)

    # normalize interference signal to have equal power with desired signal
    bad_signal *= np.sqrt(good_sigma2 / np.mean(bad_signal**2))

    # distance from the array center to the desired source
    good_distance = np.linalg.norm(bf.center[:, 0] - good_pos)

    # distance from the array center to the interfering source
    bad_distance = np.linalg.norm(bf.center[:, 0] - bad_pos)

    if good_len > bad_len:
        good_delay = 0
        bad_delay = (good_len - bad_len) / 2.
    else:
        bad_delay = 0
        good_delay = (bad_len - good_len) / 2.

    # create the reference room for free-space, noiseless, interference-free simulation
    ref_room = pra.ShoeBox3D([0, 0, 0], room_dim, Fs, max_order=0)
    ref_room.addSource(good_pos, signal=good_signal, delay=good_delay)
    ref_room.addMicrophoneArray(ref_mic)
    ref_room.compute_RIR()
    ref_room.simulate()
    reference = pra.highpass(ref_mic.signals[0], Fs)
    reference_n = pra.normalize(reference)

    # save the reference desired signal
    #wavfile.write(file_ref, Fs, pra.to_16b(reference_n))

    new_ref = good_signal.copy()
    new_ref = pra.normalize(pra.highpass(new_ref, Fs))
    wavfile.write(file_ref, Fs, pra.to_16b(new_ref))

    # add the sources to the 'real' room
    room.addSource(good_pos, signal=good_signal, delay=good_delay)
    room.addSource(bad_pos, signal=bad_signal, delay=bad_delay)

    # read in the RIR from file
    for r in range(n_mic):
        for s in [good_index, bad_index]:

            # read wav file
            fname_rir = rir_location % (r + 1, s + 1)
            rir_fs, rir = wavfile.read(fname_rir)
            rir = np.array(rir, dtype='float64')

            if rir_fs != Fs:
                raise NameError(
                    'The RIR and the signals do not have the same sampling rate.'
                )
                '''
                import scikits.samplerate as sr
                rir = sr.resample(rir, Fs/float(rir_fs), 'sinc_best')

                # the factor 2 was empirically determined to be necessary to get
                # amplitude of RIR in the correct ballpark.
                rir *= 2.
                '''

            room.rir.append([])
            room.rir[r].append(rir)

    # compute the input signal to the microphones
    room.simulate()

    # save degraded signal at reference microphone
    raw = bf.signals[ref_mic_n]
    raw_n = pra.normalize(pra.highpass(raw, Fs))
    wavfile.write(file_raw, Fs, pra.to_16b(raw_n))

    pesq_input = pra.pesq(file_ref, file_raw, Fs=Fs)

    for src in room.sources:
        src.setOrdering('strongest', ref_point=bf.center)

    for k, s in enumerate(n_sources):

        good_img = room.sources[0][:s]
        bad_img = room.sources[1][:s]

        for i, bfr in enumerate(beamformer_names):

            bf_weights_fun[i](good_img,
                              bad_img,
                              sigma2_n * np.eye(n_mic * Lg),
                              delay=delay_bf)

            # run beamformer
            output = bf.process()
            output = pra.normalize(pra.highpass(output, Fs))
            output = pra.time_align(reference_n, output)

            # save files for PESQ evaluation
            wavfile.write(files_bf[i], Fs, pra.to_16b(output))

            # compute PESQ
            pesq_bf[:, i, k] = pra.pesq(file_ref, files_bf[i], Fs=Fs).T
    ''' This is how you can compare the true RIRs with the image src model generated one
    plt.figure()
    for m in range(n_mic):

        rir_sim = room.sources[0].getRIR(mics[:,m], Fs)
        plt.subplot(3,3,m+1)
        plt.plot(room.rir[m][0][:rir_sim.shape[0]])
        plt.plot(rir_sim)

    plt.show()
    '''

    print('Finished')

    return pesq_input, pesq_bf
Example #22
    pos_noise = [2.8, 4.3]
    fft_len = 1024

    # use circular array with center mic
    center = np.array([2, 1.5])
    radius = 0.2
    R = pra.circular_2D_array(center, M=6, phi0=0, radius=radius)
    R = np.concatenate((R, np.array(center, ndmin=2).T), axis=1)

    # visualize the setup
    room = pra.ShoeBox(room_dim,
                       absorption=absorption_fact,
                       max_order=max_order)
    room.add_source(pos_source)
    room.add_source(pos_noise)
    room.add_microphone_array(pra.Beamformer(R, room.fs, N=fft_len))
    room.mic_array.rake_delay_and_sum_weights(room.sources[0][:1])
    room.plot(freq=[500, 1000, 2000, 4000], img_order=0)
    plt.title("Simulation setup and polar patterns")
    plt.legend(['500', '1000', '2000', '4000'])
    plt.grid()

    #create the dataset object
    dataset = pra.datasets.GoogleSpeechCommands(download=True, subset=1)

    #separate the noise and the speech samples
    noise_samps = dataset.filter(speech=0)
    speech_samps = dataset.filter(speech=1)
    speech_samps = speech_samps.filter(word=desired_word)

    #pick one of each from WAV