Example 1
    pl.figure()
    pl.subplot(3, 1, 1)
    # panel 1: noisy log-power spectrogram (call head, scaling and title are assumed;
    # only the tail of this statement survives in the source)
    pl.imshow(np.log10(np.abs(
        noisy_spectrogram[:, TRUNCATE_GRAD // 2:-TRUNCATE_GRAD // 2]**2) * 10),
              aspect='auto')
    pl.title('noisy spectrogram')
    pl.subplot(3, 1, 2)
    pl.imshow(((n_median_s.T)), aspect="auto")
    pl.title('noise mask')
    pl.subplot(3, 1, 3)
    pl.imshow(((sp_median_s.T)), aspect="auto")
    pl.title('speech mask')
    pl.show()

#==========================================
# beamforming
#==========================================

# simple MVDR
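# constructor arguments (per the named constants in Example 2): sampling rate,
# FFT length, FFT shift, number of EM iterations, minimum segment duration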
cgmm_bf = cgmm.complexGMM_mvdr(SAMPLING_FREQUENCY, FFTL, SHIFT, 10, 10)
tmp_complex_spectrum, R_x, R_n, tt, nn = cgmm_bf.get_spatial_correlation_matrix_from_mask_for_LSTM(
    dump_speech,
    speech_mask=sp_median_s.T,
    noise_mask=n_median_s.T,
    less_frame=3)
beamformer, steering_vector = cgmm_bf.get_mvdr_beamformer(R_x, R_n)
enhan_speech = cgmm_bf.apply_beamformer(beamformer, tmp_complex_spectrum)

# reference mic selection MVDR
cgmm_bf_snr = cgmm_snr.complexGMM_mvdr(SAMPLING_FREQUENCY, FFTL, SHIFT, 10, 10)

tmp_complex_spectrum, R_x, R_n, tt, nn = cgmm_bf_snr.get_spatial_correlation_matrix_from_mask_for_LSTM(
    dump_speech,
    speech_mask=sp_median_s.T,
    noise_mask=n_median_s.T,
    less_frame=3)  # closing argument assumed to mirror the call above

Example 2
MIN_SEGMENT_DUR = 2
ENHANCED_WAV_NAME = './output/enhanced_speech_cgmm.wav'
IS_MASK_PLOT = True

def multi_channel_read(prefix=r'./sample_data/20G_20GO010I_STR.CH{}.wav',
                       channel_index_vector=np.array([1, 2, 3, 4, 5, 6])):
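    """Read each channel's wav file and stack them into a (samples, channels) float32 array."""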
    wav, _ = sf.read(prefix.replace('{}', str(channel_index_vector[0])), dtype='float32')
    wav_multi = np.zeros((len(wav), len(channel_index_vector)), dtype=np.float32)
    wav_multi[:, 0] = wav
    for i in range(1, len(channel_index_vector)):
        wav_multi[:, i] = sf.read(prefix.replace('{}', str(channel_index_vector[i])), dtype='float32')[0]
    return wav_multi

multi_channels_data = multi_channel_read()

cgmm_beamformer = cgmm.complexGMM_mvdr(SAMPLING_FREQUENCY, FFT_LENGTH, FFT_SHIFT, NUMBER_EM_ITERATION, MIN_SEGMENT_DUR)

complex_spectrum, R_x, R_n, noise_mask, speech_mask = cgmm_beamformer.get_spatial_correlation_matrix(multi_channels_data)

beamformer, steering_vector = cgmm_beamformer.get_mvdr_beamformer(R_x, R_n)

enhanced_speech = cgmm_beamformer.apply_beamformer(beamformer, complex_spectrum)

sf.write(ENHANCED_WAV_NAME, enhanced_speech / np.max(np.abs(enhanced_speech)) * 0.65, SAMPLING_FREQUENCY)

if IS_MASK_PLOT:
    pl.figure()
    pl.subplot(2, 1, 1)
    pl.imshow(np.real(noise_mask).T, aspect='auto', origin='lower', cmap='hot')
    pl.title('noise mask')
    pl.subplot(2, 1, 2)
    # panel 2 assumed to mirror the noise-mask panel above
    pl.imshow(np.real(speech_mask).T, aspect='auto', origin='lower', cmap='hot')
    pl.title('speech mask')
    pl.show()

def do_cgmm_mvdr(audio, outpath, outname):
    """
    Doing the cgmm_mvdr algorithm
    :return: no return
    """
    oo = MyTimer()
    oo.start("all")
    oo.start("init")
    cgmm_beamformer = cgmm.complexGMM_mvdr(SAMPLING_FREQUENCY, FFT_LENGTH, FFT_SHIFT, NUMBER_EM_ITERATION,
                                           MIN_SEGMENT_DUR)

    complex_spectrum_audio, _ = util.get_3dim_spectrum_from_data(audio,
                                                                 cgmm_beamformer.fft_length,
                                                                 cgmm_beamformer.fft_shift,
                                                                 cgmm_beamformer.fft_length)
    number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum_audio)

    # per-frequency spatial correlation matrices: R_noisy is the time-averaged
    # covariance of the noisy observation, R_noise starts from the identity
    R_noise = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64)
    R_noisy = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64)

    for f in range(0, number_of_bins):
        for t in range(0, number_of_frames):
            # rank-1 outer product x x^H of the multi-channel STFT vector
            h = np.outer(complex_spectrum_audio[:, t, f], np.conj(complex_spectrum_audio[:, t, f]))
            R_noisy[:, :, f] = R_noisy[:, :, f] + h
        R_noisy[:, :, f] = R_noisy[:, :, f] / number_of_frames
        R_noise[:, :, f] = np.eye(number_of_channels, number_of_channels, dtype=np.complex64)
    R_xn = copy.deepcopy(R_noisy)
    del complex_spectrum_audio, audio, _
    gc.collect()
    oo.stopPrint("init")
    oo.start("em")
    R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64)
    gc.disable()
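    # run the CGMM mask / covariance estimation on each input chunk in turn;
    # a chunk that raises is simply skipped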
    for i in range(len(multi_channels_data)):
    # for i in range(1):
        try:
            oo.start("chunk " + str(i + 1))

            os.system("echo ---- chunk " + str(i + 1) + ' ----')
            os.system("echo " + str(input_data_list[i]))
            R_noise, R_noisy, R_n = cgmm_beamformer.get_spatial_correlation_matrix(
                multi_channels_data[i], R_noise, R_noisy, R_n)
            oo.stopPrint("chunk " + str(i + 1))
        except Exception:
            continue

    oo.stopPrint("em")
    gc.enable()

    oo.start("mask")
    R_x = R_xn - R_n
    oo.stopPrint("mask")
    os.system("echo mask estimation done")

    oo.start("bmf")
    beamformer, steering_vector = cgmm_beamformer.get_mvdr_beamformer(R_x, R_n)
    os.system("echo bmf done")
    audio = multi_channel_read(inputli, '')
    complex_spectrum_audio, _ = util.get_3dim_spectrum_from_data(audio,
                                                                 cgmm_beamformer.fft_length,
                                                                 cgmm_beamformer.fft_shift,
                                                                 cgmm_beamformer.fft_length)

    enhanced_speech = cgmm_beamformer.apply_beamformer(beamformer, complex_spectrum_audio)
    os.system("echo enhan done")

    wf.write(outpath + '/' + outname,
             SAMPLING_FREQUENCY, enhanced_speech / np.max(np.abs(enhanced_speech)) * 0.65)
    os.system("echo all done")
    oo.stopPrint("bmf")
    oo.stopPrint("all")