Example #1
import numpy as np

# Assumed context: Timer and the beamforming helpers
# (get_power_spectral_density_matrix, get_gev_vector,
# blind_analytic_normalization, apply_beamforming_vector) come from the
# surrounding beamforming module.


def gev_wrapper_on_masks(mix,
                         noise_mask=None,
                         target_mask=None,
                         normalization=True):
    """Estimate target/noise PSD matrices from masks, compute the GEV
    beamforming vector and apply it to the multi-channel STFT `mix`."""
    if noise_mask is None and target_mask is None:
        raise ValueError('At least one mask needs to be present.')

    # `.T` reverses the axis order so that frequency is the leading axis,
    # which is the layout the PSD estimation below expects.
    mix = mix.T
    if noise_mask is not None:
        noise_mask = noise_mask.T
    if target_mask is not None:
        target_mask = target_mask.T

    # If only one mask is given, derive the other as its complement,
    # clipped away from zero for numerical stability.
    if target_mask is None:
        target_mask = np.clip(1 - noise_mask, 1e-6, 1)
    if noise_mask is None:
        noise_mask = np.clip(1 - target_mask, 1e-6, 1)

    # Timing accumulators in milliseconds.
    t_psd = 0
    t_vector = 0
    t_beam = 0

    with Timer() as t:
        # Mask-weighted PSD estimates for the target and the noise.
        target_psd_matrix = get_power_spectral_density_matrix(mix, target_mask)
        noise_psd_matrix = get_power_spectral_density_matrix(mix, noise_mask)
    t_psd += t.msecs
    # Beamforming vector

    with Timer() as t:
        W_gev = get_gev_vector(target_psd_matrix, noise_psd_matrix)
    t_vector += t.msecs

    with Timer() as t:
        if normalization:
            # Blind analytic normalization resolves the arbitrary scaling
            # of the GEV eigenvector to reduce speech distortion.
            W_gev = blind_analytic_normalization(W_gev, noise_psd_matrix)

        output = apply_beamforming_vector(W_gev, mix)
    t_beam += t.msecs
    print(
        'Timings: PSD: {:.2f}s | Vector: {:.2f}s | Beam: {:.2f}s | Total: {:.2f}s'
        .format(t_psd / 1000, t_vector / 1000, t_beam / 1000,
                ((t_psd + t_vector + t_beam) / 1000)))

    return output.T
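
The example relies on a get_gev_vector helper that is not shown. A minimal sketch of how such a helper is commonly implemented, assuming PSD matrices of shape (bins, sensors, sensors) as produced above: it solves a generalized eigenvalue problem per frequency bin and keeps the eigenvector belonging to the largest eigenvalue, which maximizes the expected output SNR.

import numpy as np
from scipy.linalg import eigh

def get_gev_vector(target_psd_matrix, noise_psd_matrix):
    # Sketch: per-frequency generalized eigenvalue problem
    # Phi_XX w = lambda * Phi_NN w. eigh returns eigenvalues in
    # ascending order, so the last eigenvector is the GEV beamformer.
    bins, sensors, _ = target_psd_matrix.shape
    beamforming_vector = np.empty((bins, sensors), dtype=np.complex128)
    for f in range(bins):
        _, eigenvecs = eigh(target_psd_matrix[f], noise_psd_matrix[f])
        beamforming_vector[f, :] = eigenvecs[:, -1]
    return beamforming_vector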
Example #2
def single_normal():
    """Load a multi-channel recording, estimate time-frequency masks with
    the trained model and enhance the signal with the GEV beamformer."""
    # Timing accumulators in milliseconds.
    t_io = 0
    t_net = 0
    t_beamform = 0

    # Time the audio I/O.
    with Timer() as t:
        audio_data = get_audio_nochime(args.data_directory,
                                       ch_range=range(1, 3),
                                       fs=16000)
        context_samples = 0
        print("audio_data: ", audio_data.shape)
    t_io += t.msecs

    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))

    # Keep the phase so the estimated noise can be resynthesised later.
    Y_phase = np.divide(Y, abs(Y))
    print("Y: ", Y.shape, "Y_phase: ", Y_phase.shape)
    # Wrapping the magnitudes in a chainer Variable does not change the result.
    Y_var = Variable(np.abs(Y).astype(np.float32))

    # Time the mask-estimation network.
    with Timer() as t:
        N_masks, X_masks = model.calc_masks(Y_var)
        print("N_masks: ", N_masks.shape)
        N_masks.to_cpu()
        X_masks.to_cpu()
    t_net += t.msecs

    with Timer() as t:
        # Pool the per-channel masks over the channel axis.
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)

        # Reconstruct the estimated noise signal from the noise masks,
        # then pool it over the channels.
        Noise = np.multiply(N_masks.data, Y)
        Noise = np.multiply(Noise, Y_phase)
        Noise = np.median(Noise, axis=1)

        # GEV beamforming on the pooled masks.
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs

    # Time writing the output wavs.
    with Timer() as t:
        audiowrite(
            istft(Noise)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_noise.wav"
            .format(args.exNum), 16000, True, True)
        audiowrite(
            istft(Y_hat)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_gev.wav"
            .format(args.exNum), 16000, True, True)
    t_io += t.msecs
    print(
        'Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | Total: {:.2f}s'
        .format(t_io / 1000, t_net / 1000, t_beamform / 1000,
                ((t_io + t_net + t_beamform) / 1000)))
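
Both examples accumulate timings through a Timer context manager that exposes the elapsed time as t.msecs. The class itself is not part of the listings; a minimal sketch that matches that usage (the implementation is an assumption, only the msecs attribute is dictated by the code above):

import time

class Timer(object):
    """Context manager measuring wall-clock time in milliseconds."""

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *exc):
        # Exceptions still propagate; only the elapsed time is recorded.
        self.msecs = (time.time() - self.start) * 1000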
Example #3
# The snippet begins mid-script: an if/elif over `scenario` evidently
# precedes this branch (gen_flist_simu is the presumed simulated-data
# counterpart of gen_flist_real).
if scenario == 'simu':
    flist = gen_flist_simu(args.chime_dir, stage)
elif scenario == 'real':
    flist = gen_flist_real(args.chime_dir, stage)
else:
    raise ValueError('Unknown flist {}'.format(args.flist))

for env in ['caf', 'bus', 'str', 'ped']:
    for beamformer in beamformers:
        mkdir_p(os.path.join(
            args.output_dir, beamformer,
            '{}05_{}_{}'.format(stage, env, scenario)))

t_io = 0
t_net = 0
t_beamform = 0
# Beamform loop
for cur_line in tqdm(flist):
    with Timer() as t:
        if scenario == 'simu':
            audio_data = get_audio_data(cur_line)
            context_samples = 0
        elif scenario == 'real':
            audio_data, context_samples = get_audio_data_with_context(
                    cur_line[0], cur_line[1], cur_line[2])
    t_io += t.msecs
    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
    Y_var = Variable(np.abs(Y).astype(np.float32))
    if args.gpu >= 0:
        Y_var.to_gpu(args.gpu)
    with Timer() as t:
        N_masks, X_masks = model.calc_masks(Y_var)
        N_masks.to_cpu()
        X_masks.to_cpu()
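
The listing breaks off inside the mask-estimation block. Judging from Example #2, which runs the same pipeline outside a loop, the loop body would plausibly continue roughly as follows (a sketch based on Example #2, not the original code):

    t_net += t.msecs

    with Timer() as t:
        # Pool the per-channel masks and beamform, as in Example #2.
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs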