Code Example #1
File: generators.py  Project: zzz9999/FRIDA
def gen_speech_at_mic_stft(phi_ks,
                           source_signals,
                           mic_array_coord,
                           noise_power,
                           fs,
                           fft_size=1024):
    """
    generate microphone signals with short time Fourier transform
    :param phi_ks: azimuth of the acoustic sources
    :param source_signals: speech signals for each arrival angle, one per row
    :param mic_array_coord: x and y coordinates of the microphone array
    :param noise_power: the variance of the microphone noise signal
    :param fs: sampling frequency
    :param fft_size: number of FFT bins
    :return: y_hat_stft: received (complex) signal at microphones
             y_hat_stft_noiseless: the noiseless received (complex) signal at microphones
    """
    frame_shift_step = int(fft_size / 1.)  # hop equal to fft_size, i.e. no overlap between frames
    K = source_signals.shape[0]  # number of point sources
    num_mic = mic_array_coord.shape[1]  # number of microphones

    # Generate the impulse responses for the array and source directions
    impulse_response = gen_far_field_ir(np.reshape(phi_ks, (1, -1), order='F'),
                                        mic_array_coord, fs)
    # Now generate all the microphone signals
    y = np.zeros(
        (num_mic, source_signals.shape[1] + impulse_response.shape[2] - 1),
        dtype=np.float32)
    for src in range(K):
        for mic in range(num_mic):
            y[mic] += fftconvolve(impulse_response[src, mic],
                                  source_signals[src])

    # Now do the short time Fourier transform
    # The resulting signal is M x fft_size/2+1 x number of frames
    y_hat_stft_noiseless = \
        np.array([pra.stft(signal, fft_size, frame_shift_step, transform=mkl_fft.rfft).T
                  for signal in y]) / np.sqrt(fft_size)

    # Add noise to the signals
    y_noisy = y + np.sqrt(noise_power) * np.array(np.random.randn(*y.shape),
                                                  dtype=np.float32)
    # compute sources stft
    source_stft = \
        np.array([pra.stft(s_loop, fft_size, frame_shift_step, transform=mkl_fft.rfft).T
                  for s_loop in source_signals]) / np.sqrt(fft_size)

    y_hat_stft = \
        np.array([pra.stft(signal, fft_size, frame_shift_step, transform=mkl_fft.rfft).T
                  for signal in y_noisy]) / np.sqrt(fft_size)

    return y_hat_stft, y_hat_stft_noiseless, source_stft
Code Example #2
File: script_rtf.py  Project: YoussefJanjar/alexa
def feature_Vector_testPos(index_src, D=256):
    '''
    This function creates the feature vector for a given test position using the microphones in the room
    '''
    
    overlap = 1
    rir0 = room_test.rir[0][index_src]
    rir1 = room_test.rir[1][index_src]
    len0, len1 = rir0.shape[0], rir1.shape[0]

    atf_0 = pra.stft(rir0, L=D, hop=int(len0 * overlap), win=pra.windows.hann(N=256))
    atf_1 = pra.stft(rir1, L=D, hop=int(len1 * overlap), win=pra.windows.hann(N=256))
    
    return atf_0 / atf_1
Code Example #3
File: matrix_doa.py  Project: zhouxzh/doa
def matrix_doa():
    global source_signal
    #print(source_signal)
    #algo_names = ['SRP', 'MUSIC', 'TOPS', 'CSSM', 'WAVES']
    algo_name = 'SRP'
    #print('The algorithms {} will be used.'.format(algo_name))
    nfft = 256  # FFT size
    ################################
    # Compute the STFT frames needed
    X = np.array([
        pra.stft(source_signal[:, i], nfft, nfft // 2, transform=np.fft.rfft).T
        for i in range(CHANNELS)
    ])

    ##############################################
    # Construct the new DOA object
    # the max_four parameter is necessary for FRIDA only
    doa = pra.doa.algorithms[algo_name](R, fs, nfft, c=c)

    # this call performs localization on the frames in X
    doa.locate_sources(X, freq_range=[1000, 3000])

    # doa.azimuth_recon contains the reconstructed location of the source
    angle = doa.azimuth_recon / np.pi * 180
    print('  Recovered azimuth:', angle, 'degrees')
    return (angle)
Code Example #4
def run_doa(angle, h, algo, doa_kwargs, freq_bins, speakers_numbering):
    ''' Run the doa localization for one source location and one algorithm '''

    # Prepare the DOA localizer object
    algo_key = doa_kwargs['algo_obj']
    doa = pra.doa.algorithms[algo_key](mic_array,
                                       fs,
                                       nfft,
                                       c=c,
                                       num_src=1,
                                       dim=3,
                                       **doa_kwargs)

    # get the loudspeaker index from its name
    spkr = speakers_numbering[h]

    # open the recording file
    filename = fn.format(name=sample_name, spkr=spkr, angle=angle)
    fs_data, data = wavfile.read(filename)

    if fs_data != fs:
        raise ValueError('Sampling frequency mismatch')

    # do time-freq decomposition
    X = np.array([
        pra.stft(signal, nfft, stft_hop, transform=np.fft.rfft).T
        for signal in data.T
    ])

    # run doa
    doa.locate_sources(X, freq_bins=freq_bins)
    col = float(doa.colatitude_recon[0])
    az = float(doa.azimuth_recon[0])

    # manual calibration groundtruth
    col_gt_man = locations['speakers_manual_colatitude'][h]
    az_gt_man = np.radians(int(angle))
    error_man = pra.doa.great_circ_dist(1., col, az, col_gt_man, az_gt_man)

    # optimized calibration groundtruth
    col_gt_opt = locations['sources'][h]['colatitude'][angle]
    az_gt_opt = locations['sources'][h]['azimuth'][angle]
    error_opt = pra.doa.great_circ_dist(1., col, az, col_gt_opt, az_gt_opt)

    print(algo, h, angle, ': Err Man=', error_man, 'Opt=', error_opt)

    return {
        'algo': algo,
        'angle': angle,
        'spkr_height': h,
        'loc_man': (col_gt_man, az_gt_man),
        'loc_opt': (col_gt_opt, az_gt_opt),
        'loc_doa': (col, az),
        'error_man': float(error_man),
        'error_opt': float(error_opt),
    }
Code Example #5
def difference_of_arrivals(speed_of_sound, signal_list, algorithm_name,
                           num_sources, *mic_location):
    """Gets an azimuth and co-latitude for each pair of microphones.

        Args:
            speed_of_sound: specific speed of sound
            signal_list: the microphone signals
            algorithm_name: specific direction of arrival (DOA) method
            num_sources: number of sources to find
            mic_location: location of each microphone

        Returns:
             doa.azimuth_recon: Azimuth angle
             doa.colatitude_recon: Co-latitude angle
    """

    # Constants
    fs = 16000  # sampling frequency
    nfft = 256  # FFT size

    # Add n-microphone array in [x,y,z] order
    m = np.vstack(list(zip(*mic_location)))

    # Create an array of a short fourier transformed frequency signal
    x = np.array([
        pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
        for signal in signal_list
    ])

    # Frequency Range
    freq_range = [0, 250]

    # Construct the new DOA object
    doa = pra.doa.algorithms[algorithm_name](
        L=m,
        fs=fs,
        nfft=nfft,
        c=speed_of_sound,
        num_src=num_sources,
        max_four=4,
        dim=3,
        azimuth=np.linspace(-180., 180., 360) * np.pi / 180,
        colatitude=np.linspace(-90., 90., 180) * np.pi / 180)

    # Locate the sources
    doa.locate_sources(x, freq_range=freq_range)

    # Return all in radians
    return doa.azimuth_recon, doa.colatitude_recon
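
A minimal usage sketch for difference_of_arrivals() above (not part of the original project): the two microphone positions and the white-noise test signals are made up for illustration, and numpy/pyroomacoustics are assumed to be installed.

import numpy as np

# hypothetical inputs: two microphones 10 cm apart on the x-axis,
# one second of white noise per channel stands in for a real recording
fs = 16000
signals = [np.random.randn(fs), np.random.randn(fs)]
mic_0 = [0.00, 0.0, 0.0]   # [x, y, z] in meters
mic_1 = [0.10, 0.0, 0.0]

azimuth, colatitude = difference_of_arrivals(343.0, signals, 'MUSIC', 1, mic_0, mic_1)
print(np.degrees(azimuth), np.degrees(colatitude))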
Code Example #6
    def test_stft_nowindow(self):
        frames = 100
        fftsize = [128, 256, 512]
        hop_div = [1, 2]
        loops = 10

        for n in fftsize:
            for div in hop_div:
                for epoch in range(loops):
                    x = np.random.randn(frames * n // div + n - n // div)
                    X = pra.stft(x, n, n // div, transform=np.fft.rfft)
                    y = pra.istft(X, n, n // div, transform=np.fft.irfft)

                    # because of overlap, there is a scaling at reconstruction
                    y[n // div:-n // div] /= div
                    self.assertTrue(np.allclose(x, y))
Code Example #7
    def get_difference_of_arrivals(self, signal_list, *mic_location):
        """Returns an azimuth and co-latitude for each pair of
           microphone combinations. Note: all angles are returned in radians

            Args:
                signal_list: (list) microphones signals
                *mic_location: (list) location of each microphone

            Returns:
                 doa.azimuth_recon: (float) Azimuth angle
                 doa.colatitude_recon: (float) Co-latitude angle
        """

        # Add n-microphone array in [x,y,z] order
        m = np.vstack(list(zip(*mic_location)))

        # TODO: Figure out this deprecation
        # Create an array of a short fourier transformed frequency signal
        if self.transform:
            x = np.array([
                pra.stft(signal,
                         self.fft_size,
                         self.fft_size // 2,
                         transform=np.fft.rfft).T for signal in signal_list
            ])
        else:
            x = np.array([
                pra.transform.stft.analysis(signal, self.fft_size,
                                            self.fft_size // 2).T
                for signal in signal_list
            ])

        # Construct the new DOA object
        doa = pra.doa.algorithms.get(self.algo_name)(
            L=m,
            fs=self.fs,
            nfft=self.fft_size,
            c=self.sound_speed,
            num_src=self.num_sources,
            max_four=4,
            dim=3,
            azimuth=np.linspace(-180., 180., 360) * np.pi / 180,
            colatitude=np.linspace(-90., 90., 180) * np.pi / 180)

        doa.locate_sources(x, freq_range=self.freq_range)

        return doa.azimuth_recon, doa.colatitude_recon
Code Example #8
    def plot(self, L=512, hop=128, zpb=0, phonems=False, **kwargs):

        try:
            import matplotlib.pyplot as plt
            import seaborn as sns
        except ImportError:
            return

        sns.set_style("white")
        X = stft(
            self.data,
            L=L,
            hop=hop,
            zp_back=zpb,
            transform=np.fft.rfft,
            win=np.hanning(L + zpb),
        )
        X = 10 * np.log10(np.abs(X)**2).T

        plt.imshow(X, origin="lower", aspect="auto")

        ticks = []
        ticklabels = []

        if phonems:
            for phonem in self.phonems:
                plt.axvline(x=phonem["bnd"][0] / hop)
                plt.axvline(x=phonem["bnd"][1] / hop)
                ticks.append((phonem["bnd"][1] + phonem["bnd"][0]) / 2 / hop)
                ticklabels.append(phonem["name"])

        else:
            for word in self.words:
                plt.axvline(x=word.boundaries[0] / hop)
                plt.axvline(x=word.boundaries[1] / hop)
                ticks.append(
                    (word.boundaries[1] + word.boundaries[0]) / 2 / hop)
                ticklabels.append(word.word)

        plt.xticks(ticks, ticklabels, rotation=-45)
        plt.yticks([], [])
        plt.tick_params(axis="both", which="major", labelsize=14)
Code Example #9
    def difference_of_arrivals(self, signal_list, *mic_location):
        """Returns an azimuth and co-latitude for each pair of
           microphone combinations. Note: all angles are returned in radians

            Args:
                signal_list: (list) microphones signals
                *mic_location: (list) location of each microphone

            Returns:
                 doa.azimuth_recon: (float) Azimuth angle
                 doa.colatitude_recon: (float) Co-latitude angle

            Raises:
                ValueError: Signal list is empty
                ValueError: None in Signal list
                ValueError: Microphone list is empty
                ValueError: None in microphone list
        """
        print(type(signal_list))
        print(signal_list)
        if not signal_list:
            raise ValueError('Error. Signal list is empty.')

        if np.array(signal_list).shape[0] == 1 and None in signal_list:
            raise ValueError('Error. None in signal list.')

        # This works for lists of lists, but not for single list
        if any([True for signal in signal_list if None in signal]):
            raise ValueError('Error. None in signal list.')

        if not mic_location:
            raise ValueError('Error. Microphone location list is empty.')

        if None in mic_location:
            raise ValueError('Error. None in microphone location list.')

        # Add n-microphone array in [x,y,z] order
        m = np.vstack(list(zip(*mic_location)))

        # TODO: Figure out this deprecation
        # Create an array of a short fourier transformed frequency signal
        if self.transform:
            x = np.array([
                pra.stft(signal,
                         self.fft_size,
                         self.fft_size // 2,
                         transform=np.fft.rfft).T for signal in signal_list
            ])
        else:
            x = np.array([
                pra.transform.stft.analysis(signal, self.fft_size,
                                            self.fft_size // 2).T
                for signal in signal_list
            ])

        # Construct the new DOA object
        doa = pra.doa.algorithms.get(self.algo_name)(
            L=m,
            fs=self.fs,
            nfft=self.fft_size,
            c=self.sound_speed,
            num_src=self.num_sources,
            max_four=4,
            dim=3,
            azimuth=np.linspace(-180., 180., 360) * np.pi / 180,
            colatitude=np.linspace(-90., 90., 180) * np.pi / 180)

        doa.locate_sources(x, freq_range=self.freq_range)

        return doa.azimuth_recon, doa.colatitude_recon
Code Example #10
def test_sparseauxiva():
    fs = 16000

    signals = [
        np.concatenate([
            wavfile.read(f)[1].astype(np.float32, order='C')
            for f in source_files
        ]) for source_files in wav_files
    ]

    wavfile.write('sample1.wav', fs, np.asarray(signals[0], dtype=np.int16))
    wavfile.write('sample2.wav', fs, np.asarray(signals[1], dtype=np.int16))

    # Define an anechoic room environment, as well as the microphone array and source locations.

    # Room 8m by 9m
    room_dim = [8, 9]
    # source locations and delays
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]
    # create an anechoic room with sources and mics
    room = pra.ShoeBox(room_dim,
                       fs=16000,
                       max_order=15,
                       absorption=0.35,
                       sigma2_awgn=1e-8)

    # add mic and good source to room
    # Add silent signals to all sources
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array

    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], room.fs))

    # Compute the RIRs as in the Room Impulse Response generation section.

    # compute RIRs
    room.compute_rir()

    # Record each source separately

    separate_recordings = []
    for source, signal in zip(room.sources, signals):
        source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    # save mixed signals as wav files
    wavfile.write('mix1.wav', fs, np.asarray(mics_signals[0].T,
                                             dtype=np.int16))
    wavfile.write('mix2.wav', fs, np.asarray(mics_signals[1].T,
                                             dtype=np.int16))
    wavfile.write(
        'mix1_norm.wav', fs,
        np.asarray(mics_signals[0].T / np.max(np.abs(mics_signals[0].T)) *
                   32767,
                   dtype=np.int16))
    wavfile.write(
        'mix2_norm.wav', fs,
        np.asarray(mics_signals[1].T / np.max(np.abs(mics_signals[1].T)) *
                   32767,
                   dtype=np.int16))

    # STFT frame length
    L = 2048

    # START BSS
    ###########

    # Preprocessing
    # Observation vector in the STFT domain
    X = np.array([
        pra.stft(ch,
                 L,
                 L,
                 transform=np.fft.rfft,
                 zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Reference signal to calculate performance of BSS
    ref = np.moveaxis(separate_recordings, 1, 2)

    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    S = np.argpartition(average, -k)[-k:]
    S = np.sort(S)
    n_iter = 30

    # Run SparseAuxIva
    Y = pra.bss.sparseauxiva(X, S, n_iter, lasso=True)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch],
                  L,
                  L,
                  transform=np.fft.irfft,
                  zp_front=L // 2,
                  zp_back=L // 2) for ch in range(Y.shape[2])
    ])

    # Compare SIR and SDR with our reference signal
    sdr, isr, sir, sar, perm = bss_eval_images(
        ref[:, :y.shape[1] - L // 2, 0], y[:, L // 2:ref.shape[1] + L // 2])
    print('SDR: {0}, SIR: {1}'.format(sdr, sir))

    wavfile.write('demix1.wav', fs, np.asarray(y[0].T, dtype=np.int16))
    wavfile.write('demix2.wav', fs, np.asarray(y[1].T, dtype=np.int16))
    wavfile.write(
        'demix1_norm.wav', fs,
        np.asarray(y[0].T / np.max(np.abs(y[0].T)) * 32767, dtype=np.int16))
    wavfile.write(
        'demix2_norm.wav', fs,
        np.asarray(y[1].T / np.max(np.abs(y[1].T)) * 32767, dtype=np.int16))
Code Example #11
File: srp-phat.py  Project: zcy618/SRP-PHAT
def srp_phat(s,
             fs,
             nFFT=None,
             center=None,
             d=None,
             azimuth_estm=None,
             mode=None):
    '''
    Applies the Steered Response Power with Phase Transform (SRP-PHAT) algorithm
    Uses pyroomacoustics module
    
    Input params
    ------------
    s: numpy array
        -stacked microphone array signals 
        (NOTE: number of microphones is extracted from the size of input signal,
         since the input signal will be of size MxN where M is number of microphones
         and N is the length of the audio signal.)
    fs: int
        -Sampling frequency
    nFFT: int
        -FFT size. Default 1024
    center: numpy array
        -Defines the center of the room. Default [0,0]
    d: int
        -Distance between microphones. Default 10cm.
    azimuth_estm: numpy array
        -Candidate azimuth estimates, representing location estimates of speakers.
         Default expects microphone to be in the middle of a table and speakers located around it.
         Assumes two speakers - [60,120]
    mode: str
        -Defines the microphone setup layout. Default mode = linear.
        mode = linear 
        mode = circular
    '''
    if nFFT is None:
        nFFT = 1024
    if center is None:
        center = [0, 0]
    if d is None:
        d = 0.1
    if azimuth_estm is None:
        azimuth_estm = [60, 120]

    freq_bins = np.arange(
        30, 330)  #list of individual frequency bins used to run DoA
    M = s.shape[0]  #number of microphones
    phi = 0  #assume angle between microphones is 0 (same y-axis)
    radius = d * M / (2 * np.pi)  #define radius for circular microphone layout
    c = 343.0  #speed of sound

    # Define microphone array layout
    if mode == 'circular':
        L = pra.circular_2D_array(center, M, phi, radius)
    else:  # default layout is linear
        L = pra.linear_2D_array(center, M, phi, d)

    nSrc = len(azimuth_estm)  #number of speakers

    #STFT
    s_FFT = np.array([
        pra.stft(sig, nFFT, nFFT // 2, transform=np.fft.rfft).T for sig in s
    ])  #STFT for each microphone channel

    #SRP
    doa = pra.doa.srp.SRP(L, fs, nFFT, c, max_four=4,
                          num_src=nSrc)  #perform SRP approximation
    #Apply SRP-PHAT
    doa.locate_sources(s_FFT, freq_bins=freq_bins)

    #PLOTTING
    doa.polar_plt_dirac()
    plt.title('SRP-PHAT')
    print('SRP-PHAT')
    print('Speakers at: ', np.sort(doa.azimuth_recon) / np.pi * 180, 'degrees')
    plt.show()
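
A hedged usage sketch for srp_phat() above: the four-channel white-noise array s is a stand-in for a real capture, and the keyword arguments simply restate the documented defaults.

import numpy as np

fs = 16000
s = np.random.randn(4, 2 * fs)   # (M microphones, N samples) stand-in recording
srp_phat(s, fs, nFFT=1024, d=0.1, azimuth_estm=[60, 120], mode='linear')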
Code Example #12
def test_bss(algo, L):

    # Room dimensions in meters
    room_dim = [8, 9]

    # create a room with sources and mics
    room = pra.ShoeBox(room_dim, fs=16000, max_order=0, sigma2_awgn=1e-8)

    # get signals
    signals = [
        np.concatenate(
            [wavfile.read(f)[1].astype(np.float32) for f in source_files])
        for source_files in wav_files
    ]
    delays = [1., 0.]
    locations = [[2.5, 3], [2.5, 6]]

    # add mic and good source to room
    # Add silent signals to all sources
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], fs=room.fs))

    # compute RIRs
    room.compute_rir()

    # Record each source separately
    separate_recordings = []
    for source, signal in zip(room.sources, signals):

        source.signal[:] = signal

        room.simulate()
        separate_recordings.append(room.mic_array.signals)

        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    ## STFT analysis
    # shape == (n_chan, n_frames, n_freq)
    X = pra.transform.analysis(mics_signals.T,
                               L,
                               L,
                               zp_front=L // 2,
                               zp_back=L // 2,
                               bits=64)

    X_test = np.array([
        pra.stft(ch,
                 L,
                 L,
                 transform=np.fft.rfft,
                 zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X_test = np.moveaxis(X_test, 0, 2)
    ## START BSS
    if choices[algo] == 'auxIVA':
        # Run AuxIVA
        Y = pra.bss.auxiva(X, n_iter=30, proj_back=True)
        max_mse = 1e-5
    elif choices[algo] == 'ILRMA':
        # Run ILRMA
        Y = pra.bss.ilrma(X, n_iter=30, n_components=30, proj_back=True)
        max_mse = 1e-5
    elif choices[algo] == 'sparseauxIVA':
        # Estimate set of active frequency bins
        ratio = 0.35
        average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
        k = np.int_(average.shape[0] * ratio)
        S = np.sort(np.argpartition(average, -k)[-k:])
        # Run SparseAuxIva
        Y = pra.bss.sparseauxiva(X, S, n_iter=30, proj_back=True)
        max_mse = 1e-4

    ## STFT Synthesis
    y = pra.transform.synthesis(Y,
                                L,
                                L,
                                zp_front=L // 2,
                                zp_back=L // 2,
                                bits=64).T

    # Calculate MSE
    #############
    ref = np.moveaxis(separate_recordings, 1, 2)
    y_aligned = y[:, L // 2:ref.shape[1] + L // 2]

    mse = np.mean((ref[:, :y_aligned.shape[1], 0] - y_aligned)**2)
    input_variance = np.var(np.concatenate(signals))

    print(
        '%s with frame length of %d: Relative MSE (expected less than %.e)' %
        (choices[algo], L, max_mse), mse / input_variance)
    assert (mse / input_variance) < max_mse
Code Example #13
        for s in speech_signals.T:
            s[:] = pra.highpass(s, fs, fc=150.)
        for s in silence.T:
            s[:] = pra.highpass(s, fs, fc=150.)

    # Normalize the amplitude
    n_factor = 0.95 / np.max(np.abs(speech_signals))
    speech_signals *= n_factor
    silence *= n_factor

    # estimate noise floor
    y_noise_stft = []
    for k in range(num_mic):
        y_stft = pra.stft(silence[:, k],
                          fft_size,
                          frame_shift_step,
                          transform=rfft,
                          win=win_stft).T / np.sqrt(fft_size)
        y_noise_stft.append(y_stft)
    y_noise_stft = np.array(y_noise_stft)
    noise_floor = np.mean(np.abs(y_noise_stft)**2)

    # estimate SNR in dB (on 1st microphone)
    noise_var = np.mean(np.abs(silence)**2)
    sig_var = np.mean(np.abs(speech_signals)**2)
    # rough estimate of SNR
    SNR = 10 * np.log10((sig_var - noise_var) / noise_var)
    print('Estimated SNR: ' + str(SNR))

    # Compute DFT of snapshots
    # -------------------------
Code Example #14
def beamformed_doa_plot(comb):
    f1_data = f1['data']
    f2_data = f2['data']

    # azimuth = np.array([math.atan2(1.5, 0.5), math.atan2(1.5, -0.5)])
    azimuth = np.array([
        90.,
        270.,
    ]) * np.pi / 180
    distance = 1.5

    c = 343.  # speed of sound
    fs = 16000  # sampling frequency
    nfft = 256  # FFT size
    freq_range = [300, 400]
    sr = 16000
    snr_db = 5.  # signal-to-noise ratio
    # sigma2 = 10**(-snr_db / 10) / (4. * np.pi * distance)**2

    # Add sources of 1 second duration
    rng = np.random.RandomState(23)
    duration_samples = int(sr)

    room_dim = np.r_[4., 6.]
    room = pra.ShoeBox(room_dim, fs=sr)

    echo = pra.linear_2D_array(center=(room_dim / 2), M=5, phi=0, d=0.5)
    room.add_microphone_array(pra.MicrophoneArray(echo, room.fs))
    # R = pra.linear_2D_array([2, 1.5], 4, 0, 0.04)

    # source_location = room_dim / 2 + distance * np.r_[np.cos(ang), np.sin(ang)]
    # source_signal = rng.randn(duration_samples)
    # room.add_source(source_location, signal=source_signal)

    # room.add_source(np.array([1.5, 4.5]), delay=0., signal=f1_data)
    # room.add_source(np.array([2.5, 4.5]), delay=0., signal=f2_data[:len(f1_data)])

    for ang in azimuth:
        source_location = room_dim / 2 + distance * np.r_[np.cos(ang),
                                                          np.sin(ang)]
        source_signal = rng.randn(duration_samples)
        room.add_source(source_location, signal=source_signal)

    room.simulate()

    X = np.array([
        pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
        for signal in room.mic_array.signals
    ])

    # DOA_algorithm = 'MUSIC'
    # spatial_resp = dict()

    doa = pra.doa.algorithms['MUSIC'](echo,
                                      fs,
                                      nfft,
                                      c=c,
                                      num_src=2,
                                      max_four=4)

    # this call performs localization on the frames in X
    doa.locate_sources(X, freq_range=freq_range)

    spatial_resp = doa.grid.values

    # normalize
    min_val = spatial_resp.min()
    max_val = spatial_resp.max()
    spatial_resp = (spatial_resp - min_val) / (max_val - min_val)

    # plotting param
    base = 1.
    height = 10.
    true_col = [0, 0, 0]

    # loop through algos
    phi_plt = doa.grid.azimuth

    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='polar')
    c_phi_plt = np.r_[phi_plt, phi_plt[0]]
    c_dirty_img = np.r_[spatial_resp, spatial_resp[0]]
    ax.plot(
        c_phi_plt,
        base + height * c_dirty_img,
        linewidth=3,
        alpha=0.55,
        linestyle='-',
        # label="spatial spectrum"
    )
    # plt.title('MUSIC')

    # plot true loc
    # for angle in azimuth:
    #     ax.plot([angle, angle], [base, base + height], linewidth=3, linestyle='--',
    #         color=true_col, alpha=0.6)
    # K = len(azimuth)
    # ax.scatter(azimuth, base + height*np.ones(K), c=np.tile(true_col,
    #            (K, 1)), s=500, alpha=0.75, marker='*',
    #            linewidths=0,
    #            # label='true locations'
    #            )

    plt.legend()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles,
              labels,
              framealpha=0.5,
              scatterpoints=1,
              loc='center right',
              fontsize=16,
              ncol=1,
              bbox_to_anchor=(1.6, 0.5),
              handletextpad=.2,
              columnspacing=1.7,
              labelspacing=0.1)

    ax.set_xticks(np.linspace(0, 2 * np.pi, num=12, endpoint=False))
    ax.xaxis.set_label_coords(0.5, -0.11)
    ax.set_yticks(np.linspace(0, 1, 2))
    ax.xaxis.grid(b=True, color=[0.3, 0.3, 0.3], linestyle=':')
    ax.yaxis.grid(b=True, color=[0.3, 0.3, 0.3], linestyle='--')
    ax.set_ylim([0, 1.05 * (base + height)])

    plt.show()
Code Example #15
source_location = room_dim / 2 + distance * np.r_[np.cos(azimuth),
                                                  np.sin(azimuth)]
source_signal = np.random.randn((nfft // 2 + 1) * nfft)
aroom.add_source(source_location, signal=source_signal)

# We use a circular array with radius 15 cm and 12 microphones
R = pra.circular_2D_array(room_dim / 2, 12, 0., 0.15)
aroom.add_microphone_array(pra.MicrophoneArray(R, fs=aroom.fs))

# run the simulation
aroom.simulate()

################################
# Compute the STFT frames needed
X = np.array([
    pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
    for signal in aroom.mic_array.signals
])

##############################################
# Now we can test all the algorithms available
algo_names = sorted(pra.doa.algorithms.keys())

for algo_name in algo_names:
    # Construct the new DOA object
    # the max_four parameter is necessary for FRIDA only
    doa = pra.doa.algorithms[algo_name](R, fs, nfft, c=c, max_four=4)

    # this call performs localization on the frames in X
    doa.locate_sources(X, freq_bins=freq_bins)
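
The snippet stops right after locate_sources; one plausible way to finish the loop, following the convention used in Examples #3 and #30 (an assumption, not part of the original file), is:

    # report the azimuth recovered by each algorithm, converted to degrees
    print(algo_name, 'recovered azimuth:', doa.azimuth_recon / np.pi * 180, 'degrees')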
Code Example #16
def parallel_loop(filename, algo_names, pmt):
    '''
    This is one loop of the computation
    extracted for parallelization
    '''

    # We need to do a bunch of imports
    import pyroomacoustics as pra
    import os
    import numpy as np
    from scipy.io import wavfile
    import mkl as mkl_service
    import copy

    import doa
    from tools import rfft

    # for such parallel processing, it is better 
    # to deactivate multithreading in mkl
    mkl_service.set_num_threads(1)

    # extract the speaker names from the filename
    name = os.path.splitext(os.path.basename(filename))[0]
    sources = name.split('-')

    # number of sources
    K = len(sources)

    # Import speech signal
    fs_file, rec_signals = wavfile.read(filename)

    # sanity check
    if pmt['fs'] != fs_file:
        raise ValueError("The sampling frequency of the files doesn't match that of the script")
    
    speech_signals = np.array(rec_signals[:,pmt['mic_select']], dtype=np.float32)

    # Remove the DC bias
    for s in speech_signals.T:
        s[:] = pra.highpass(s, pmt['fs'], 100.)

    if pmt['stft_win']:
        stft_win = np.hanning(pmt['nfft'])
    else:
        stft_win = None

    # Normalize the amplitude
    speech_signals *= pmt['scaling']

    # Compute STFT of signal
    # -------------------------
    y_mic_stft = []
    for k in range(speech_signals.shape[1]):
        y_stft = pra.stft(speech_signals[:, k], pmt['nfft'], pmt['stft_hop'],
                          transform=rfft, win=stft_win).T / np.sqrt(pmt['nfft'])
        y_mic_stft.append(y_stft)
    y_mic_stft = np.array(y_mic_stft)

    # estimate SNR in dB (on 1st microphone)
    sig_var = np.var(speech_signals)
    SNR = 10*np.log10( (sig_var - pmt['noise_var']) / pmt['noise_var'] )

    freq_bins = copy.copy(pmt['freq_bins'][K-1])

    # dict for output
    phi_recon = {}

    for alg in algo_names:

        # Use the convenient dictionary of algorithms defined
        d = doa.algos[alg](
                L=pmt['mic_array'], 
                fs=pmt['fs'], 
                nfft=pmt['nfft'], 
                num_src=K, 
                c=pmt['c'], 
                theta=pmt['phi_grid'], 
                max_four=pmt['M'], 
                num_iter=pmt['num_iter'],
                G_iter = pmt['G_iter']
                )

        # perform localization
        d.locate_sources(y_mic_stft, freq_bins=freq_bins[alg])

        # store result
        phi_recon[alg] = d.phi_recon

    return SNR, sources, phi_recon
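
A hedged sketch of how parallel_loop() above might be fanned out over many recordings with the standard-library multiprocessing module; filenames, algo_names and pmt are assumed to be prepared by the calling script.

import functools
from multiprocessing import Pool

# one worker process per core; each call runs the single-file loop defined above
with Pool(processes=4) as pool:
    results = pool.map(
        functools.partial(parallel_loop, algo_names=algo_names, pmt=pmt),
        filenames)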
Code Example #17
File: generators.py  Project: zzz9999/FRIDA
def gen_sig_at_mic_stft(phi_ks,
                        alpha_ks,
                        mic_array_coord,
                        SNR,
                        fs,
                        fft_size=1024,
                        Ns=256):
    """
    generate microphone signals with short time Fourier transform
    :param phi_ks: azimuth of the acoustic sources
    :param alpha_ks: power of the sources
    :param mic_array_coord: x and y coordinates of the microphone array
    :param SNR: signal to noise ratio at the microphone
    :param fs: sampling frequency
    :param fft_size: number of FFT bins
    :param Ns: number of time snapshots used to estimate covariance matrix
    :return: y_hat_stft: received (complex) signal at microphones
             y_hat_stft_noiseless: the noiseless received (complex) signal at microphones
    """
    frame_shift_step = int(fft_size / 1.)  # hop equal to fft_size, i.e. no overlap between frames
    K = alpha_ks.shape[0]  # number of point sources
    num_mic = mic_array_coord.shape[1]  # number of microphones

    # Generate the impulse responses for the array and source directions
    impulse_response = gen_far_field_ir(np.reshape(phi_ks, (1, -1), order='F'),
                                        mic_array_coord, fs)

    # Now generate some noise
    # source_signal = np.random.randn(K, Ns * fft_size) * np.sqrt(alpha_ks[:, np.newaxis])
    source_signal = np.random.randn(K, fft_size + (Ns - 1) * frame_shift_step) * \
                    np.sqrt(np.reshape(alpha_ks, (-1, 1), order='F'))

    # Now generate all the microphone signals
    y = np.zeros(
        (num_mic, source_signal.shape[1] + impulse_response.shape[2] - 1),
        dtype=np.float32)
    for src in range(K):
        for mic in range(num_mic):
            y[mic] += fftconvolve(impulse_response[src, mic],
                                  source_signal[src])

    # Now do the short time Fourier transform
    # The resulting signal is M x fft_size/2+1 x number of frames
    y_hat_stft_noiseless = \
        np.array([pra.stft(signal, fft_size, frame_shift_step, transform=mkl_fft.rfft).T
                  for signal in y]) / np.sqrt(fft_size)

    # compute noise variance based on SNR
    signal_energy = linalg.norm(y_hat_stft_noiseless.flatten())**2
    noise_energy = signal_energy / 10**(SNR * 0.1)
    sigma2_noise = noise_energy / y_hat_stft_noiseless.size

    # Add noise to the signals
    y_noisy = y + np.sqrt(sigma2_noise) * np.array(np.random.randn(*y.shape),
                                                   dtype=np.float32)

    y_hat_stft = \
        np.array([pra.stft(signal, fft_size, frame_shift_step, transform=mkl_fft.rfft).T
                  for signal in y_noisy]) / np.sqrt(fft_size)

    return y_hat_stft, y_hat_stft_noiseless
Code Example #18
dSNR = pra.dB(room1.dSNR(mics.center[:,0], source=0), power=True)
print('The direct SNR for good source is ' + str(dSNR))

# remove a bit of signal at the end
n_lim = int(np.ceil(len(input_mic) - t_cut*Fs))  # integer sample index for slicing
input_clean = signal1[:n_lim]
input_mic = input_mic[:n_lim]
out_DirectMVDR = out_DirectMVDR[:n_lim]
out_RakeMVDR = out_RakeMVDR[:n_lim]
out_DirectPerceptual = out_DirectPerceptual[:n_lim]
out_RakePerceptual = out_RakePerceptual[:n_lim]


# compute time-frequency planes
F0 = pra.stft(input_clean, fft_size, fft_hop, 
          win=analysis_window, 
          zp_back=fft_zp)
F1 = pra.stft(input_mic, fft_size, fft_hop, 
          win=analysis_window, 
          zp_back=fft_zp)
F2 = pra.stft(out_DirectMVDR, fft_size, fft_hop, 
          win=analysis_window, 
          zp_back=fft_zp)
F3 = pra.stft(out_RakeMVDR, fft_size, fft_hop, 
          win=analysis_window, 
          zp_back=fft_zp)
F4 = pra.stft(out_DirectPerceptual, fft_size, fft_hop, 
          win=analysis_window, 
          zp_back=fft_zp)
F5 = pra.stft(out_RakePerceptual, fft_size, fft_hop, 
          win=analysis_window, 
Code Example #19
    female_speakers_train = list(set([s.speaker for s in filter(lambda x: x.sex == 'F', corpus.sentence_corpus['TRAIN'])]))

    print('Pick a subset of', n_speakers, 'speakers')
    training_set_speakers = male_speakers_train[:n_speakers] + female_speakers_train[:n_speakers]
    print(training_set_speakers)

    # compute all the spectrograms
    print('Compute all the spectrograms')
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    training_set = dict()
    testing_set = dict()
    for speaker in training_set_speakers:
        training_set_sentences = filter(lambda x: x.speaker == speaker, corpus.sentence_corpus['TRAIN'])
        # X is (n_sentences, n_channel, n_frame)
        x = list()
        X = list()
        for sentence in training_set_sentences:
            print(sentence.speaker, sentence.id,)
            x.append(sentence.samples)
            X.append(pra.stft(sentence.samples, stft_win_len, stft_win_len // 2, win=window, transform=np.fft.rfft).T)
        # TRAIN:
        # Dalia says the magnitude works better...
        training_set[speaker] = np.concatenate([np.abs(spectrogram)**2 for spectrogram in X[0:9]], axis=1)
        # TEST:
        testing_set[speaker] = x[-1]

    print('Train the dictionary...')
    W_dictionary = nmf_train(training_set, n_latent_variables, n_iter=n_iter)
    W_dictionary /= np.sum(W_dictionary, axis=0)[None,:]
    np.savez('W_dictionary_em.npz', speakers=list(training_set.keys()), W_dictionary=W_dictionary, testing_set=testing_set)
Code Example #20
# channels[1] = raw_channels[2]
# channels[2] = raw_channels[1]
# channels = raw_data.T[::-1]

c = 13503.94  # speed of sound in inches/second
nfft = 256  # FFT size
freq_range = [300, 3500]

mic_positions = pra.circular_2D_array(center=(0, 0),
                                      M=3,
                                      phi0=-math.pi / 6,
                                      radius=8.66)
print(mic_positions)

X = np.array([
    pra.stft(channel, nfft, nfft // 2, transform=np.fft.rfft).T
    for channel in channels
])

doa = pra.doa.algorithms["MUSIC"](mic_positions,
                                  fs,
                                  nfft,
                                  c=c,
                                  num_src=1,
                                  max_four=4)
doa.locate_sources(X, freq_range=freq_range)
# IPython.embed()

spatial_resp = doa.grid.values
phi_plt = doa.grid.azimuth
Code Example #21
dSNR = pra.dB(room1.dSNR(mics.center[:, 0], source=0), power=True)
print('The direct SNR for good source is ' + str(dSNR))

# remove a bit of signal at the end
n_lim = int(np.ceil(len(input_mic) - t_cut * Fs))  # integer sample index for slicing
input_clean = signal1[:n_lim]
input_mic = input_mic[:n_lim]
out_DirectMVDR = out_DirectMVDR[:n_lim]
out_RakeMVDR = out_RakeMVDR[:n_lim]
out_DirectPerceptual = out_DirectPerceptual[:n_lim]
out_RakePerceptual = out_RakePerceptual[:n_lim]

# compute time-frequency planes
F0 = pra.stft(input_clean,
              fft_size,
              fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F1 = pra.stft(input_mic,
              fft_size,
              fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F2 = pra.stft(out_DirectMVDR,
              fft_size,
              fft_hop,
              win=analysis_window,
              zp_back=fft_zp)
F3 = pra.stft(out_RakeMVDR,
              fft_size,
              fft_hop,
Code Example #22
def run_at_azim(azim):
    azim_binned = azim / 5
    source_files = glob(
        '/om/user/francl/recorded_binaural_audio_4078_main_kemar_rescaled/*_{}_azim.wav'
        .format(azim))
    df = pd.DataFrame(
        columns=["azim", "predicted", "algorithm", "source_name"])
    for fname in source_files[:7]:
        freq, stim = read(fname)
        source_name = os.path.basename(fname)
        stim = stim.T
        X = np.array([
            pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T
            for signal in stim
        ])

        algo_names = ['SRP', 'MUSIC', 'TOPS', 'CSSM', 'WAVES']
        spatial_resp = dict()

        microphone = np.array([[0 - (mic_offset * 0.01) / 2.0, 0],
                               [0 + (mic_offset * 0.01) / 2.0, 0]]).T
        # loop through algos
        for algo_name in algo_names:
            # Construct the new DOA object
            # the max_four parameter is necessary for FRIDA only
            doa = pra.doa.algorithms[algo_name](microphone,
                                                fs,
                                                nfft,
                                                c=c,
                                                num_src=1,
                                                max_four=4,
                                                n_grid=72)

            # this call performs localization on the frames in X
            doa.locate_sources(X, freq_range=freq_range)

            # store spatial response
            if algo_name == 'FRIDA':
                spatial_resp[algo_name] = np.abs(doa._gen_dirty_img())
            else:
                spatial_resp[algo_name] = doa.grid.values

            # normalize
            min_val = spatial_resp[algo_name].min()
            max_val = spatial_resp[algo_name].max()
            spatial_resp[algo_name] = (spatial_resp[algo_name] -
                                       min_val) / (max_val - min_val)
        for k, v in spatial_resp.items():
            rolled_response = np.roll(v, -18)
            predicted = rolled_response.argmax()
            predicted_folded = fold_locations_full_dist_5deg(
                rolled_response).argmax()
            predicted_folded_rolled = add_fold_offset(predicted_folded,
                                                      predicted, azim_binned)
            df = df.append(
                {
                    "predicted_folded": predicted_folded_rolled,
                    "azim": azim_binned,
                    "predicted": predicted,
                    "algorithm": k,
                    "source_name": source_name
                },
                ignore_index=True)
    return df
Code Example #23
room.simulate()

# sound-to-light sensor
# we assume there is no propagation delay between speaker and sensor
leds = LightArray2(src_loc, fs=fs_light)
leds.record(target_audio + np.random.randn(*target_audio.shape) * sigma_n, fs=fs_sound)
leds_sig = leds.signals - leds.signals.min()
leds_sig /= leds_sig.max()
leds_time = np.arange(leds.signals.shape[0]) / fs_light

# perform VAD on the light signal
vad = leds.signals > vad_thresh

# Now compute the STFT of the microphone input
X = np.moveaxis([ pra.stft(a, nfft, nfft//2, np.fft.rfft, win=pra.hann(nfft)) for a in room.mic_array.signals ], 0, -1)
X_time = np.arange(1, X.shape[0]+1) * (nfft / 2) / fs_sound

# we need to match the VAD to sampling rate of X
vad_x = np.zeros(X_time.shape[0], dtype=bool)
v = 0
for i,t in enumerate(X_time):
    if v < leds_time.shape[0] - 1 and  abs(t - leds_time[v]) > abs(t - leds_time[v+1]):
        v += 1
    vad_x[i] = vad[v]
vad_x_comp = np.logical_not(vad_x)

# covariance matrix
Rs = np.einsum('i...j,i...k->...jk', X[vad_x,:,:], np.conj(X[vad_x,:,:])) / np.sum(vad_x)
Rn = np.einsum('i...j,i...k->...jk', X[vad_x_comp,:,:], np.conj(X[vad_x_comp,:,:])) / np.sum(vad_x_comp)
Code Example #24
File: bss_iva.py  Project: xinkez/pyroomacoustics
        ])
        sdr, sir, sar, perm = bss_eval_sources(
            ref[:, :y.shape[1] - L // 2, 0], y[:,
                                               L // 2:ref.shape[1] + L // 2])
        SDR.append(sdr)
        SIR.append(sir)

    # START BSS
    ###########
    # The STFT needs front *and* back padding

    # shape == (n_chan, n_frames, n_freq)
    X = np.array([
        pra.stft(ch,
                 L,
                 L,
                 transform=np.fft.rfft,
                 zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Run AuxIVA
    Y = pra.bss.auxiva(X,
                       n_iter=30,
                       proj_back=True,
                       callback=convergence_callback)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch],
                  L,
Code Example #25
def test_ilrma():

    # STFT frame length
    L = 256

    # Room 8m by 9m
    room_dim = [8, 9]

    # source location
    source = np.array([1, 4.5])

    # create an anechoic room with sources and mics
    room = pra.ShoeBox(room_dim, fs=16000, max_order=0, sigma2_awgn=1e-8)

    # get signals
    signals = [
        np.concatenate(
            [wavfile.read(f)[1].astype(np.float32) for f in source_files])
        for source_files in wav_files
    ]
    delays = [1., 0.]
    locations = [[2.5, 3], [2.5, 6]]

    # add mic and good source to room
    # Add silent signals to all sources
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], fs=room.fs))

    # compute RIRs
    room.compute_rir()

    # Record each source separately
    separate_recordings = []
    for source, signal in zip(room.sources, signals):

        source.signal[:] = signal

        room.simulate()
        separate_recordings.append(room.mic_array.signals)

        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    # START BSS
    ###########

    # shape == (n_chan, n_frames, n_freq)
    X = np.array([
        pra.stft(ch,
                 L,
                 L,
                 transform=np.fft.rfft,
                 zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Run ILRMA
    Y = pra.bss.ilrma(X, n_iter=30, n_components=30, proj_back=True)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch],
                  L,
                  L,
                  transform=np.fft.irfft,
                  zp_front=L // 2,
                  zp_back=L // 2) for ch in range(Y.shape[2])
    ])

    # Compare SIR
    #############
    ref = np.moveaxis(separate_recordings, 1, 2)
    y_aligned = y[:, L // 2:ref.shape[1] + L // 2]

    mse = np.mean((ref[:, :, 0] - y_aligned)**2)
    input_variance = np.var(np.concatenate(signals))

    print('Relative MSE (expect less than 1e-5):', mse / input_variance)

    assert (mse / input_variance) < 1e-5
Code Example #26
# propagation filter bank
propagation_vector = -np.array([np.cos(azimuth), np.sin(azimuth)])
delays = np.dot(R.T, propagation_vector) / c * fs  # in fractional samples
filter_bank = pra.fractional_delay_filter_bank(delays)

# we use a white noise signal for the source
x = np.random.randn((nfft // 2 + 1) * nfft)

# convolve the source signal with the fractional delay filters
# to get the microphone input signals
mic_signals = [fftconvolve(x, filter, mode='same') for filter in filter_bank]
X = np.array([
    pra.stft(signal,
             nfft,
             nfft // 2,
             win=np.hanning(nfft),
             transform=np.fft.rfft).T for signal in mic_signals
])


class TestDOA(TestCase):
    def test_music(self):
        doa = pra.doa.algorithms['MUSIC'](R, fs, nfft, c=c)
        doa.locate_sources(X, freq_bins=freq_bins)
        print('distance:', pra.doa.circ_dist(azimuth, doa.azimuth_recon))
        self.assertTrue(pra.doa.circ_dist(azimuth, doa.azimuth_recon) < tol)

    def test_srp_phat(self):
        doa = pra.doa.algorithms['SRP'](R, fs, nfft, c=c)
        doa.locate_sources(X, freq_bins=freq_bins)
Code Example #27
def test_sparseauxiva():

    signals = [np.concatenate([wavfile.read(f)[1].astype(np.float32, order='C')
               for f in source_files])
               for source_files in wav_files]

    # Define a room environment, as well as the microphone array and source locations.
    ###########
    # Room dimensions in meters
    room_dim = [8, 9]
    # source locations and delays
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]
    # create a room with sources and mics
    room = pra.ShoeBox(room_dim, fs=16000, max_order=15, absorption=0.35, sigma2_awgn=1e-8)

    # add mic and good source to room
    # Add silent signals to all sources
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], room.fs))

    # Compute the RIRs as in the Room Impulse Response generation section.

    # compute RIRs
    room.compute_rir()

    # Record each source separately
    separate_recordings = []
    for source, signal in zip(room.sources, signals):
        source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    ###########
    mics_signals = np.sum(separate_recordings, axis=0)

    # STFT frame length
    L = 2048

    # Observation vector in the STFT domain
    X = np.array([pra.stft(ch, L, L, transform=np.fft.rfft, zp_front=L // 2, zp_back=L // 2)
                  for ch in mics_signals])
    X = np.moveaxis(X, 0, 2)

    # START BSS
    ###########
    # Estimate set of active frequency bins
    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    S = np.sort(np.argpartition(average, -k)[-k:])

    # Run SparseAuxIva
    Y = pra.bss.sparseauxiva(X, S)

    # run iSTFT
    y = np.array([pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_front=L // 2, zp_back=L // 2)
                  for ch in range(Y.shape[2])])

    # Compare SIR
    #############
    ref = np.moveaxis(separate_recordings, 1, 2)
    y_aligned = y[:,L//2:ref.shape[1]+L//2]

    mse = np.mean((ref[:,:,0] - y_aligned)**2)
    input_variance = np.var(np.concatenate(signals))

    print('Relative MSE (expect less than 1e-3):', mse / input_variance)

    assert (mse / input_variance) < 1e-3
Code Example #28
def multinmf_conv_mu_wrapper(x,
                             n_src,
                             n_latent_var,
                             stft_win_len,
                             partial_rirs=None,
                             W_dict=None,
                             n_iter=500,
                             l1_reg=0.,
                             random_seed=0,
                             verbose=False):
    '''
    A wrapper around multichannel NMF using MU updates, for use with pyroomacoustics.
    Performs STFT and ensures all signals are the correct shape.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_channel) array of time domain samples
    n_src: int
        The number of sources
    n_latent_var: int
        The number of latent variables in the NMF
    stft_win_len:
        The length of the STFT window
    partial_rirs: array_like, optional
        (n_channel x n_src x n_bins) array of partial TF. If provided, Q is not optimized.
    W_dict: array_like, optional
        A dictionary of atoms that can be used in the NMF. If provided, W is not optimized.
    n_iter: int, optional
        The number of iterations of NMF (default 500)
    l1_reg: float, optional
        The weight of the l1 regularization term for the activations (default 0., not regularized)
    random_seed: unsigned int, optional
        The seed to provide to the RNG prior to initialization of NMF parameters. This allows
        repeatable initialization.
    verbose: bool, optional
        When true, prints convergence info of NMF (default False)
    '''

    n_channel = x.shape[1]

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_channel, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch],
                 stft_win_len,
                 stft_win_len // 2,
                 win=window,
                 transform=np.fft.rfft) for ch in range(n_channel)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_channel)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    # Squared magnitude and unit energy per bin
    V = np.abs(X)**2
    V /= np.mean(V)

    # Random initialization of multichannel NMF parameters
    np.random.seed(random_seed)

    K = n_latent_var * n_src
    source_NMF_ind = []
    for j in range(n_src):
        source_NMF_ind = np.reshape(
            np.arange(n_latent_var * n_src, dtype=int), (n_src, -1))

    mix_psd = np.mean(V, axis=(1, 2))
    # W is initialized so that its energy follows the mixture PSD
    if W_dict is None:
        W_init = 0.5 * ((np.abs(np.random.randn(n_bin, K)) + np.ones(
            (n_bin, K))) * (mix_psd[:, np.newaxis] * np.ones((1, K))))
        fix_W = False
    else:
        if W_dict.shape[1] == n_latent_var:
            W_init = np.tile(W_dict, n_src)
        elif W_dict.shape[1] == n_src * n_latent_var:
            W_init = W_dict
        else:
            raise ValueError(
                'Mismatch between dictionary size and latent variables')
        fix_W = True

    # follow average activations
    mix_act = np.mean(V, axis=(0, 2))
    H_init = 0.5 * (np.abs(np.random.randn(K, n_frame)) + np.ones(
        (K, n_frame))) * mix_act[np.newaxis, :]

    if partial_rirs is not None:
        # squared mag partial rirs (n_bin, n_channel, n_src)
        Q_init = np.moveaxis(np.abs(partial_rirs)**2, [2], [0])
        Q_init /= np.max(Q_init, axis=0)[None, :, :]
        fix_Q = True
    else:
        # random initialization
        Q_shape = (n_bin, n_channel, n_src)
        Q_init = (0.5 * (1.9 * np.abs(np.random.randn(*Q_shape)) +
                         0.1 * np.ones(Q_shape)))**2
        fix_Q = False

    # RUN NMF
    W_MU, H_MU, Q_MU, cost = \
        multinmf_conv_mu(
                np.abs(X)**2, W_init, H_init, Q_init, source_NMF_ind,
                n_iter=n_iter, fix_Q=fix_Q, fix_W=fix_W,
                H_l1_reg=l1_reg,
                verbose=verbose)

    # Computation of the spatial source images
    Im = multinmf_recons_im(X, W_MU, H_MU, Q_MU, source_NMF_ind)

    sep_sources = []
    # Inverse STFT
    for j in range(n_src):
        # channel-wise istft with synthesis window
        ie_MU = []
        for ch in range(n_channel):
            ie_MU.append(
                pra.istft(Im[:, :, j, ch].T,
                          stft_win_len,
                          stft_win_len // 2,
                          win=window,
                          transform=np.fft.irfft))

        sep_sources.append(np.array(ie_MU).T)

    return np.array(sep_sources)
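
A hypothetical invocation of multinmf_conv_mu_wrapper() above, assuming the multinmf_conv_mu / multinmf_recons_im routines it references are importable; x is a fake two-channel mixture and the NMF sizes are only illustrative.

import numpy as np

x = np.random.randn(3 * 16000, 2)          # (n_samples, n_channel) mixture
sep_sources = multinmf_conv_mu_wrapper(
    x, n_src=2, n_latent_var=10, stft_win_len=2048, n_iter=100, verbose=True)
print(sep_sources.shape)                    # (n_src, n_samples_out, n_channel)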
Code Example #29
def multinmf_conv_em_wrapper(
        x, n_src, stft_win_len, n_latent_var, n_iter=500, \
        A_init=None, W_init=None, H_init=None, \
        update_a=True, update_w=True, update_h=True, \
        verbose = False):

    '''
    A wrapper around multichannel NMF using EM updates, for use with pyroomacoustics.
    Performs STFT and ensures all signals are the correct shape.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_chan) array of time domain samples
    n_latent_var: int
        number of latent variables in the NMF
    '''

    n_chan = x.shape[1]

    # STFT
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    # X is (n_chan, n_frame, n_bin)
    X = np.array(
            [pra.stft(x[:,ch], stft_win_len, stft_win_len // 2, win=window, transform=np.fft.rfft) for ch in range(n_chan)]
            )
    # move axes to match Ozerov's order (n_bin, n_frame, n_chan)
    X = np.moveaxis(X, [0,1,2], [2,1,0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    if W_init is None:
        K = n_latent_var * n_src
    else:
        K = W_init.shape[-1]

    # Random initialization of multichannel NMF parameters
    source_NMF_ind = []
    for j in range(n_src):
        source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src,-1))

    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))
    if A_init is None:
        # random initialization
        update_a = True
        A_init = (0.5 *
                    ( 1.9 * np.abs(random.randn(n_bin, n_chan, n_src))       \
                    + 0.1 * np.ones((n_bin, n_chan, n_src))                  \
                    ) * np.sign( random.randn(n_bin, n_chan, n_src)          \
                                + 1j * random.randn(n_bin, n_chan, n_src))  \
                )
    else:
        # reshape the partial rir input (n_bin, n_chan, n_src)
        A_init = np.moveaxis(A_init, [2], [0])

    # W is initialized so that its energy follows the mixture PSD
    if W_init is None:
        W_init = 0.5 * (
                ( np.abs(np.random.randn(n_bin,K)) + np.ones((n_bin,K)) )
                * ( mix_psd[:,np.newaxis] * np.ones((1,K)) )
                )
    if H_init is None:
        H_init = 0.5 * ( np.abs(np.random.randn(K,n_frame)) + np.ones((K,n_frame)) )

    Sigma_b_init = mix_psd / 100


    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init, source_NMF_ind,
            iter_num=n_iter, update_a=update_a, update_w=update_w, update_h=update_h, verbose=verbose)

    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    if verbose:
        print('Computation of the spatial source images\n')
    Ie_EM = np.zeros((n_bin,n_frame,n_src,n_chan), dtype=complex)
    for j in range(n_src):
        for f in range(n_bin):
            Ie_EM[f,:,j,:] = np.outer(Se_EM[f,:,j], Ae_EM[:,j,f])

    sep_sources = []

    # Inverse STFT
    ie_EM = []
    for j in range(n_src):
        # channel-wise istft with synthesis window
        ie_EM = []
        for ch in range(n_chan):
            ie_EM.append(
                    pra.istft(Ie_EM[:,:,j,ch].T, stft_win_len, stft_win_len // 2, win=window, transform=np.fft.irfft)
                    )
        sep_sources.append(np.array(ie_EM).T)

    return np.array(sep_sources)
Code Example #30
File: fire_doa.py  Project: zhouxzh/doa
c = 343
fs = 16000
nfft = 512


# Possible DOA algorithms: SRP, MUSIC, TOPS, CSSM, WAVES
doa = pra.doa.algorithms['SRP'](R, fs, nfft, c=c)



plt.figure()
with MicArray(fs, 4, fs/4) as mic:
    start = time.time()
    for chunk in mic.read_chunks():
        #print(chunk.shape)
        #pixels.wakeup(np.random.randint(0, 360, 1))

        X = np.array([pra.stft(chunk[i::4], nfft, nfft//2, transform=np.fft.rfft).T for i in range(4)])
        doa.locate_sources(X, freq_range=[500, 3000])
        direction = doa.azimuth_recon / np.pi * 180
        print('Time: ', time.time()-start, ' Recovered azimuth: ', direction)
        pixels.wakeup(direction)
        #plt.close()
        #doa.polar_plt_dirac()
        #plt.draw()
        #plt.pause(0.0001)

        if is_quit.is_set():
            break

Code Example #31
 def _preprocessing(audio):
     X = np.array([pra.stft(signal, nfft, nfft // 2, transform=np.fft.rfft).T for signal in audio])
     return X
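
A hedged usage sketch for the _preprocessing() helper above, assuming it is reachable as a plain function and that nfft (e.g. 256) is defined in its enclosing scope; the made-up audio array has one row per channel, and the result has shape (n_channels, nfft // 2 + 1, n_frames).

import numpy as np

nfft = 256
audio = np.random.randn(4, 16000)   # made-up 4-channel, 1 s recording at 16 kHz
X = _preprocessing(audio)
print(X.shape)                       # (4, nfft // 2 + 1, n_frames)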