Esempio n. 1
0
def apply_beamfomer(args):
    """Beamform every multichannel recording listed in ``args.flist``.

    For each path in the list file, the noisy channels are loaded,
    noise/speech masks are estimated, the selected beamformer (GEV when
    ``args.gev`` is truthy, MVDR otherwise) is applied, and the enhanced
    signal is written to ``args.dumps_dir/<list-stem>/<basename>.wav``.

    ``args`` must provide: model, flist, dumps_dir, gev.
    """
    # num_bins is expected to be a module-level constant (STFT bin count)
    estimator = MaskEstimator(num_bins)
    mask_computer = MaskComputer(estimator, args.model)

    # output sub-directory named after the list file: 'dev.flist' -> 'dev'
    flist_name = os.path.basename(args.flist)
    sub_dir = flist_name.split('.')[0]
    dumps_dir = os.path.join(args.dumps_dir, sub_dir)

    func_bf = mvdr_wrapper_on_masks if not args.gev else \
            gev_wrapper_on_masks

    # exist_ok=True removes the check-then-create race of the original
    os.makedirs(dumps_dir, exist_ok=True)

    with open(args.flist, 'r') as list_file:
        flist = list_file.readlines()

    for line in flist:
        path = line.strip()
        if not path:
            # tolerate blank lines (e.g. a trailing newline in the list)
            continue
        tokens = path.split('/')
        noisy_samples = load_multichannel_data(path)
        noisy_specs = stft(noisy_samples, time_dim=1).transpose((1, 0, 2))
        mask_n, mask_x = mask_computer.compute_masks(
            np.abs(noisy_specs).astype(np.float32))
        # median over axis 1 — presumably the channel axis after the
        # transpose above; TODO confirm against stft's output layout
        mask_n = np.median(mask_n, axis=1)
        mask_x = np.median(mask_x, axis=1)
        clean_specs = func_bf(noisy_specs, mask_n, mask_x)
        clean_samples = istft(clean_specs)
        print('dumps to {}/{}.wav'.format(dumps_dir, tokens[-1]))
        audiowrite(clean_samples, '{}/{}.wav'.format(dumps_dir, tokens[-1]),
                   16000, True, True)
Esempio n. 2
0
def audio_manipulation(self):
    """Split the long babble recording into six chime-length chunks and
    write each one out as a pseudo-channel babble.CH1..CH6 wav file.

    Fixes an off-by-one in the original: ``start = end + 1`` skipped one
    sample between consecutive chunks (a Python slice already excludes
    ``end``), shifting every chunk after the first. Also fixes the
    copy-pasted "len chime" label on the babble-length debug print.
    """
    print("longto")

    audio_file = audioread('new_dataset/chime_ex.wav', sample_rate=16000)
    babble_file = audioread('new_dataset/babble_16.wav', sample_rate=16000)

    print("len chime: ", audio_file.shape)
    print("len babble: ", babble_file.shape)

    audio_shape = audio_file.shape[0]
    babble_shape = babble_file.shape[0]
    # how many chime-sized chunks fit into the babble recording
    split = int(babble_shape / audio_shape)
    start = 0
    end = audio_shape
    for i in range(1, 7):
        print("start = ", start, "end = ", end)
        y = babble_file[start:end]
        start = end  # slice end is exclusive: the next chunk starts here
        end = end + audio_shape
        audiowrite(y, "new_dataset/babble_noise/babble.CH{}.wav".format(i))

    print("split into: ", split, "babble shape: ", babble_file.shape, "y: ",
          sys.getsizeof(y))

    # y still holds the last chunk written above
    audio_stft = stft(audio_file)
    babble_stft = stft(y)
    print(audio_stft.shape)
    print(babble_stft.shape)
Esempio n. 3
0
def prepare_custom_audio(noise_data, chime_data):
    """Carve six consecutive chunks out of *noise_data*, each as long as
    *chime_data*, and write them as babble channels CH1..CH6."""
    print("new shape: ", chime_data.shape)
    # noise_data = audioread('new_dataset/babble.wav', sample_rate=16000)
    chunk_len = chime_data.shape[0]
    for ch in range(1, 7):
        lo = (ch - 1) * chunk_len
        hi = ch * chunk_len
        segment = noise_data[lo:hi]
        print("start: ", lo, "end: ", hi, end="\n")
        audiowrite(segment, "new_dataset/babble_noise/babble.CH{}.wav".format(ch))
    sleep(0.01)
    print("last_shape: ", chime_data.shape)
Esempio n. 4
0
def apply_beamfomer(args):
    """Enhance a single multichannel recording named by ``args.flist``
    and write the beamformed result into ``args.dump``."""
    estimator = MaskEstimator(num_bins)
    mask_computer = MaskComputer(estimator, args.model)

    # choose the beamformer implementation
    if args.gev:
        func_bf = gev_wrapper_on_masks
    else:
        func_bf = mvdr_wrapper_on_masks

    tokens = args.flist.strip().split('/')
    noisy_samples = load_multichannel_data(args.flist)
    noisy_specs = stft(noisy_samples, time_dim=1).transpose((1, 0, 2))
    magnitudes = np.abs(noisy_specs).astype(np.float32)
    mask_n, mask_x = mask_computer.compute_masks(magnitudes)
    mask_n = np.median(mask_n, axis=1)
    mask_x = np.median(mask_x, axis=1)
    clean_specs = func_bf(noisy_specs, mask_n, mask_x)
    clean_samples = istft(clean_specs)
    #print('dumps to {}/{}.wav'.format(args.dump, tokens[-1]))
    audiowrite(clean_samples, '{}/{}'.format(args.dump, tokens[-1]), 16000,
               True, True)
Esempio n. 5
0
def write_wav(magnitude,
              phase,
              filename,
              exponentiate=True,
              griffin_lim=False):
    """Rebuild a waveform from a magnitude/phase pair and save it.

    When *exponentiate* is true, *magnitude* is treated as a
    log-magnitude and undone with ``np.exp``. When *griffin_lim* is
    true, ten phase-refinement iterations are run before writing.
    """
    stft_kwargs = {'size': 512, 'shift': 64, 'window_length': 512}

    mag = np.exp(magnitude) if exponentiate else magnitude
    resynth = istft(mag * np.exp(1j * phase), **stft_kwargs)

    if griffin_lim:
        for _ in range(10):
            # keep the known magnitude, re-estimate the phase from the
            # current resynthesis
            phase_est = np.angle(stft(resynth, **stft_kwargs))
            resynth = istft(mag * np.exp(1j * phase_est), **stft_kwargs)

    audiowrite(resynth, filename)
Esempio n. 6
0
def audio_joiner(path):
    """Mix every audio file found directly under *path* into one signal
    and write the sum to a fixed output file.

    Bug fixed: the original never carried the running mix between loop
    iterations (``y`` stayed an empty list, as hinted by the commented
    ``y = y + audio_file``), so only the last file's samples survived.
    The accumulator is now updated on every pass. The 'list.txt'
    create/truncate side effect is preserved, but the handle is closed
    instead of leaked.
    """
    chime_data_dir = path
    print(path)
    flist = [
        f for f in listdir(chime_data_dir) if isfile(join(chime_data_dir, f))
    ]
    # original opened 'list.txt' and never wrote or closed it; keep the
    # truncate side effect but release the handle immediately
    open('list.txt', 'w').close()

    mix = None  # running sum of all tracks, as long as the longest one
    for item in flist:
        audio_file = audioread('{}/{}'.format(path, item), sample_rate=16000)
        print(item)
        if mix is None:
            mix = audio_file.copy()
        elif len(audio_file) < len(mix):
            # shorter track: add it onto the head of the current mix
            mix[:len(audio_file)] += audio_file
        else:
            # longer track: extend the mix to the new length
            longer = audio_file.copy()
            longer[:len(mix)] += mix
            mix = longer

    audiowrite(mix,
               '/media/hipo/lento/Dataset/LibriSpeech/test/com.flac',
               samplerate=16000)
Esempio n. 7
0
def single_normal():
    """Run the enhancement chain once and report per-stage timings.

    Reads a 2-channel recording from ``args.data_directory``, estimates
    noise/speech masks with the module-level ``model``, GEV-beamforms,
    and writes both the extracted noise and the enhanced signal to disk.

    Relies on module-level globals: ``args`` (data_directory, exNum),
    ``model`` (provides ``calc_masks``), ``Timer``, and the project
    stft/istft/audiowrite helpers.
    """
    # audio_data = get_audio_nochime('data/new_dataset/216m/2m_pub_new', ch_range=range(1, 9), fs=16000)
    # noise_data = get_audio_nochime('data/new_dataset/blstm_noise/noise_124', ch_range=range(1, 9), fs=16000)
    # audio_data = get_audio_nochime(args.data_directory, ch_range=range(1, 3), fs=16000)
    # per-stage timers, accumulated in milliseconds (Timer exposes .msecs)
    t_io = 0
    t_net = 0
    t_beamform = 0

    # check execution time
    with Timer() as t:
        audio_data = get_audio_nochime(args.data_directory,
                                       ch_range=range(1, 3),
                                       fs=16000)
        context_samples = 0
        print("audio_data: ", audio_data.shape, end="\n")
        # for i in range (0, 8):
        #     print(audio_data[i][1])
    t_io += t.msecs

    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
    # N = stft(noise_data, time_dim=1).transpose((1, 0, 2))

    # unit-magnitude phase factor of the noisy spectrum (Y / |Y|)
    Y_phase = np.divide(Y, abs(Y))
    print("Y: ", Y.shape, "Y_phase: ", Y_phase.shape, end="\n")
    # Y_var with or without chainer Variable class doesn't give any different
    Y_var = Variable(np.abs(Y).astype(np.float32))

    # N_var = Variable(np.abs(N).astype(np.float32), True)
    # blstm_noise = Variable(np.abs(blstm_noise).astype(np.float32), True)

    with Timer() as t:
        # mask estimation
        N_masks, X_masks = model.calc_masks(Y_var)
        # Noise_masks = model.calc_mask_noise(N_var)
        print("N_masks: ", N_masks.shape, end="\n")
        # move mask data to host memory (chainer-style .to_cpu) before
        # the numpy operations below
        N_masks.to_cpu()
        X_masks.to_cpu()
    t_net += t.msecs
    # Noise_masks.to_cpu()

    with Timer() as t:
        # collapse per-channel masks with a median over axis 1 —
        # presumably the channel axis after the transpose above; confirm
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)

        # Noise_mask = np.median(Noise_masks.data, axis=1)

        # signal = audioread('data/new_dataset/216m/2m_pub_new' + '.CH{}.wav'.format(ch), sample_rate=16000)
        # noise = audioread('data/new_dataset/gevnoise/gevnoise' + '.CH{}.wav'.format(ch), sample_rate=16000)
        # signal_ = stft(signal)
        # noise_ = stft(noise)
        #
        # signal_phase = np.divide(signal, abs(signal_))
        # noise_masks = model.calc_mask_noise(noise_)
        # noise_to = np.multiply(noise_masks.data, signal_)
        # noise_to = np.multiply(noise_to, signal_phase)
        # audiowrite(istft(noise_to)[context_samples:],
        #            "/home/hipo/workspace/BeamSaber/result/noise/noise_to_.CH{}.wav".format(ch), 16000, True, True)

        # noise estimate: apply the noise mask to Y, then restore phase
        Noise = np.multiply(N_masks.data, Y)
        Noise = np.multiply(Noise, Y_phase)
        # Y_phase_med = np.median(Y_phase, axis=1)
        # print(Noise.shape)
        # for ch in range(0, 8):
        #     audiowrite(istft(Noise[:,ch,:])[context_samples:],
        #                "/home/hipo/workspace/BeamSaber/result/noise/2mnoise_.CH{}.wav".format(ch), 16000, True, True)
        Noise = np.median(Noise, axis=1)

        # print("N_mask: ", N_mask.shape, "X_mask: ", X_mask.shape, "Y_phase: ", Y_phase.shape, end="\n")
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
        # print(Y_hat.shape)
        # print("Noise: ", Noise.shape)
    t_beamform += t.msecs

    # write the noise estimate and the enhanced signal (I/O timed)
    with Timer() as t:
        audiowrite(
            istft(Noise)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_noise.wav"
            .format(args.exNum), 16000, True, True)
        audiowrite(
            istft(Y_hat)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_gev.wav"
            .format(args.exNum), 16000, True, True)
    t_io += t.msecs
    print(
        'Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | Total: {:.2f}s'
        .format(t_io / 1000, t_net / 1000, t_beamform / 1000,
                ((t_io + t_net + t_beamform) / 1000)))
Esempio n. 8
0
    # NOTE(review): fragment — the enclosing function/loop and the
    # definitions of Timer, Y, N_masks, X_masks, beamformers, cur_line,
    # scenario, stage, args, context_samples and the t_* accumulators
    # live outside this chunk.
    with Timer() as t:
        # collapse per-channel masks via the median, then run every
        # requested beamformer; result is {beamformer_name: spectrum}
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        Y_hat_dicts = bf_wrapper_on_masks(Y, N_mask, X_mask, beamformers=beamformers)
    t_beamform += t.msecs

    # recover speaker / utterance / environment tokens from the CHiME
    # file naming convention (simu: a path string; real: a record tuple)
    if scenario == 'simu':
        wsj_name = cur_line.split('/')[-1].split('_')[1]
        spk = cur_line.split('/')[-1].split('_')[0]
        env = cur_line.split('/')[-1].split('_')[-1]
    elif scenario == 'real':
        wsj_name = cur_line[3]
        spk = cur_line[0].split('/')[-1].split('_')[0]
        env = cur_line[0].split('/')[-1].split('_')[-1]

    # one output file per beamformer, under its own sub-directory
    for beamformer, Y_hat in Y_hat_dicts.items():
        filename = os.path.join(
                args.output_dir,beamformer,
                '{}05_{}_{}'.format(stage, env.lower(), scenario),
                '{}_{}_{}.wav'.format(spk, wsj_name, env.upper())
        )
        with Timer() as t:
            audiowrite(istft(Y_hat)[context_samples:], filename, 16000, True, True)
        t_io += t.msecs

# NOTE(review): dedent to module level — presumably after the file loop
print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
        t_io / 1000, t_net / 1000, t_beamform / 1000
))
Esempio n. 9
0
    # NOTE(review): fragment — NN_masks/XX_masks, Y_hat, Timer, args and
    # the t_* accumulators are defined outside this chunk.
    NN_masks.to_cpu()
    XX_masks.to_cpu()

with Timer() as t:
    # second-pass masks: median over axis 1 (presumably channels)
    NN_mask = np.median(NN_masks.data, axis=1)
    XX_mask = np.median(XX_masks.data, axis=1)
    print("Y: ",
          Y_hat.shape,
          "N_mask: ",
          NN_mask.shape,
          "X_mask: ",
          XX_mask.shape,
          end="\n")
    # try:
    # re-beamform the first-pass output Y_hat with the new masks
    YY_hat = gev_wrapper_on_masks(Y_hat, NN_mask, XX_mask)
    # except AttributeError:
    #     YY_hat = gev_wrapper_on_masks(Y, NN_mask, XX_mask)

with Timer() as t:
    # NOTE(review): written at 48 kHz here, unlike the 16 kHz used
    # elsewhere in this file — confirm this is intentional
    audiowrite(
        istft(YY_hat),
        "new_dataset_result/2m_feedback_{}.wav".format(args.experiments),
        48000, True, True)
t_io += t.msecs

print('Finished')
print(
    'Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | Total Time: {:.2f}s'
    .format(t_io / 1000, t_net / 1000, t_beamform / 1000,
            ((t_io / 1000) + (t_net / 1000) + (t_beamform / 1000))))
Esempio n. 10
0
        # NOTE(review): fragment — the enclosing loop/function and the
        # definitions of Timer, Y, N_masks, X_masks, cur_line, scenario,
        # stage, args, audio_data, context_samples and the t_*
        # accumulators live outside this chunk.
        X_masks.to_cpu()
    t_net += t.msecs

    with Timer() as t:
        # median over axis 1 collapses the per-channel masks, then GEV
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs

    # recover speaker / utterance / environment tokens from the CHiME
    # file naming convention (simu: a path string; real: a record tuple)
    if scenario == 'simu':
        wsj_name = cur_line.split('/')[-1].split('_')[1]
        spk = cur_line.split('/')[-1].split('_')[0]
        env = cur_line.split('/')[-1].split('_')[-1]
    elif scenario == 'real':
        wsj_name = cur_line[3]
        spk = cur_line[0].split('/')[-1].split('_')[0]
        env = cur_line[0].split('/')[-1].split('_')[-1]

    filename = os.path.join(args.output_dir,
                            '{}05_{}_{}'.format(stage, env.lower(), scenario),
                            '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))
    with Timer() as t:
        # istft gets the original sample count so the output length
        # matches the input; context_samples trims any lead-in
        audiowrite(
            istft(Y_hat, audio_data.shape[1])[context_samples:], filename,
            16000, True, True)
    t_io += t.msecs

# NOTE(review): dedent to module level — presumably after the file loop
print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
    t_io / 1000, t_net / 1000, t_beamform / 1000))
Esempio n. 11
0
    # NOTE(review): fragment — the enclosing loop/function and the
    # definitions of Timer, Y, N_masks, X_masks, cur_line, scenario,
    # stage, args, context_samples and the t_* accumulators live
    # outside this chunk.
    t_net += t.msecs

    with Timer() as t:
        # median over axis 1 collapses the per-channel masks, then GEV
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs

    # recover speaker / utterance / environment tokens from the CHiME
    # file naming convention (simu: a path string; real: a record tuple)
    if scenario == 'simu':
        wsj_name = cur_line.split('/')[-1].split('_')[1]
        spk = cur_line.split('/')[-1].split('_')[0]
        env = cur_line.split('/')[-1].split('_')[-1]
    elif scenario == 'real':
        wsj_name = cur_line[3]
        spk = cur_line[0].split('/')[-1].split('_')[0]
        env = cur_line[0].split('/')[-1].split('_')[-1]

    filename = os.path.join(
            args.output_dir,
            '{}05_{}_{}'.format(stage, env.lower(), scenario),
            '{}_{}_{}.wav'.format(spk, wsj_name, env.upper())
    )
    with Timer() as t:
        audiowrite(istft(Y_hat)[context_samples:], filename, 16000, True, True)
    t_io += t.msecs

# NOTE(review): dedent to module level — presumably after the file loop
print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
        t_io / 1000, t_net / 1000, t_beamform / 1000
))
Esempio n. 12
0
        # NOTE(review): fragment — this chunk is syntactically
        # inconsistent on its own (dedents to module level and then
        # resumes at loop indentation at the `if scenario ==` below);
        # the enclosing loop/function and the definitions of Timer,
        # Y_hat, N_mask, X_mask, cur_line, scenario, stage, args,
        # audio_data, context_samples, outfile_postfix and the t_*
        # accumulators live outside this view.
        #Y_hat = mcmf_wrapper_on_masks(Y, N_mask, X_mask, output_setup, corr_info)
    t_beamform += t.msecs

    # the spliter in Win '\' and Linux '/'
    # NOTE(review): this variant splits on backslash, i.e. Windows paths
    if scenario == 'simu':
        wsj_name = cur_line.split('\\')[-1].split('_')[1]
        spk = cur_line.split('\\')[-1].split('_')[0]
        env = cur_line.split('\\')[-1].split('_')[-1]
    elif scenario == 'real':
        wsj_name = cur_line[3]
        spk = cur_line[0].split('\\')[-1].split('_')[0]
        env = cur_line[0].split('\\')[-1].split('_')[-1]

    filename = os.path.join(args.output_dir,
                            '{}05_{}_{}'.format(stage, env.lower(), scenario),
                            '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))

    # append outfile_postfix before the '.wav' extension
    audiowrite(
        istft(Y_hat, audio_data.shape[1])[context_samples:],
        filename[:-4] + outfile_postfix + '.wav', 16000, True, True)
'''
    # direct apply the mask on the DS
    Y_ds_hat = np.sum(Y, axis=1) * X_mask
    audiowrite(istft(Y_ds_hat, audio_data.shape[1])[context_samples:], filename[:-4]+'_X.wav', 16000, True, True)
    Y_ds_hat = np.sum(Y, axis=1) * X_mask/(X_mask+N_mask)
    audiowrite(istft(Y_ds_hat, audio_data.shape[1])[context_samples:], filename[:-4]+'_W.wav', 16000, True, True)
'''
print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
    t_io / 1000, t_net / 1000, t_beamform / 1000))
        # track-2 / single-channel selection logic from a different part
        # of the original file; indentation resumes mid-loop here
        if scenario == 'simu' or args.track == 2:
            wsj_name = cur_line.split('/')[-1].split('_')[1]
            spk = cur_line.split('/')[-1].split('_')[0]
            env = cur_line.split('/')[-1].split('_')[-1]
        elif scenario == 'real':
            wsj_name = cur_line[3]
            spk = cur_line[0].split('/')[-1].split('_')[0]
            env = cur_line[0].split('/')[-1].split('_')[-1]

        filename = os.path.join(
            args.output_dir, '{}05_{}_{}'.format(stage, env.lower(), scenario),
            '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))
    # track 1: write channel 0 only; single==0: write the full result;
    # single>=1: write the selected 1-based channel
    if args.track == 1:
        with Timer() as t:
            audiowrite(
                istft(Y_hat[:, 0, :])[int(context_samples):], filename, 16000,
                True, True)
        t_io += t.msecs
    elif args.single == 0:
        with Timer() as t:
            audiowrite(
                istft(Y_hat)[int(context_samples):], filename, 16000, True,
                True)
        t_io += t.msecs
    elif args.single >= 1:
        ch = args.single
        with Timer() as t:
            audiowrite(
                istft(Y_hat[:, ch - 1, :])[int(context_samples):], filename,
                16000, True, True)
        t_io += t.msecs