Beispiel #1
0
def prepare_training_data(chime_data_dir, dest_dir):
    """Export mask training data for the 'tr' and 'dt' stages.

    For every entry returned by ``gen_flist_simu`` the '.Clean' and '.Noise'
    audio is loaded, transformed with ``stft`` (size 256, shift 128) and
    passed to ``estimate_IBM``. The two masks plus the magnitude of the
    summed spectra are pickled to ``<dest_dir>/<stage>/<name>``, and the
    relative paths of all exported files are written to
    ``<dest_dir>/flist_<stage>.json``.

    Args:
        chime_data_dir: Data root, forwarded unchanged to ``gen_flist_simu``.
        dest_dir: Output root; the per-stage sub-directories are created
            when missing.
    """
    for stage in ['tr', 'dt']:
        flist = gen_flist_simu(chime_data_dir, stage, ext=True)

        # The pickles below are opened with plain open(), which does not
        # create parent directories, so make sure the stage folder exists.
        stage_dir = os.path.join(dest_dir, stage)
        if not os.path.isdir(stage_dir):
            os.makedirs(stage_dir)

        export_flist = []
        for f in flist:
            clean_audio = get_audio_data(f, '.Clean')
            noise_audio = get_audio_data(f, '.Noise')
            X = stft(clean_audio, time_dim=1, size=256, shift=128).transpose(
                (1, 0, 2))
            N = stft(noise_audio, time_dim=1, size=256, shift=128).transpose(
                (1, 0, 2))
            IBM_X, IBM_N = estimate_IBM(X, N)
            Y_abs = np.abs(X + N)
            export_dict = {
                'IBM_X': IBM_X.astype(np.float32),
                'IBM_N': IBM_N.astype(np.float32),
                'Y_abs': Y_abs.astype(np.float32)
            }
            # Last '/'-separated component of the entry names the output file.
            base_name = f.split('/')[-1]
            with open(os.path.join(stage_dir, base_name), 'wb') as fid:
                pickle.dump(export_dict, fid)
            export_flist.append(os.path.join(stage, base_name))

        with open(os.path.join(dest_dir, 'flist_{}.json'.format(stage)),
                  'w') as fid:
            json.dump(export_flist, fid, indent=4)
Beispiel #2
0
def prepare_training_data(chime_data_dir, dest_dir, suffix_id):
    """Export mask training data for the 'tr' and 'dt' stages.

    ``gen_flist_simu`` is called with ``suffix_id`` and returns a path (used
    only as the progress-bar label) plus the list of entries. For each entry
    the '_clean' and '_noise' audio is loaded, transformed with ``stft`` and
    fed to ``estimate_IBM``; the masks and the magnitude of the summed
    spectra are pickled below ``<dest_dir>/<stage>/``. The relative paths of
    the exported files go to ``<dest_dir>/flist_<stage>_<suffix_id>.json``.
    """
    for stage in ['tr', 'dt']:
        fpath, entries = gen_flist_simu(chime_data_dir, stage, suffix_id)
        exported = []
        mkdir_p(os.path.join(dest_dir, stage))

        progress = tqdm.tqdm(
            entries, desc='Generating data for {}'.format(fpath))
        for entry in progress:
            speech = get_audio_data(entry, '_clean')
            noise = get_audio_data(entry, '_noise')
            spec_x = stft(speech, time_dim=1).transpose((1, 0, 2))
            spec_n = stft(noise, time_dim=1).transpose((1, 0, 2))
            mask_x, mask_n = estimate_IBM(spec_x, spec_n)
            mixture_mag = np.abs(spec_x + spec_n)

            payload = {
                'IBM_X': mask_x.astype(np.float32),
                'IBM_N': mask_n.astype(np.float32),
                'Y_abs': mixture_mag.astype(np.float32)
            }

            # Output file is named after the last '/'-separated component.
            leaf = entry.split('/')[-1]
            with open(os.path.join(dest_dir, stage, leaf), 'wb') as out:
                pickle.dump(payload, out)
            exported.append(os.path.join(stage, leaf))

        json_name = 'flist_{}_{}.json'.format(stage, suffix_id)
        with open(os.path.join(dest_dir, json_name), 'w') as out:
            json.dump(exported, out, indent=4)
Beispiel #3
0
def audio_manipulation(self):
    """Cut the babble recording into six consecutive reference-length chunks.

    Reads 'new_dataset/chime_ex.wav' and 'new_dataset/babble_16.wav' at
    16 kHz, slices the babble signal into six back-to-back segments whose
    length equals the first dimension of the reference file, and writes them
    to 'new_dataset/babble_noise/babble.CH<i>.wav' (i = 1..6). Finally the
    STFT shapes of the reference file and of the last chunk are printed.

    ``self`` is not used.
    """
    print("longto")

    audio_file = audioread('new_dataset/chime_ex.wav', sample_rate=16000)
    babble_file = audioread('new_dataset/babble_16.wav', sample_rate=16000)

    print("len chime: ", audio_file.shape)
    print("len babble: ", babble_file.shape)

    chunk_len = audio_file.shape[0]
    babble_len = babble_file.shape[0]
    # Number of whole chunks the babble file could provide (informational).
    split = int(babble_len / chunk_len)

    start = 0
    end = chunk_len
    for i in range(1, 7):
        print("start = ", start, "end = ", end)
        y = babble_file[start:end]
        audiowrite(y, "new_dataset/babble_noise/babble.CH{}.wav".format(i))
        # Slice ends are exclusive, so the next chunk starts exactly at
        # `end`; starting at `end + 1` would drop a sample and shorten
        # every chunk after the first.
        start = end
        end = end + chunk_len

    print("split into: ", split, "babble shape: ", babble_file.shape, "y: ",
          sys.getsizeof(y))

    audio_stft = stft(audio_file)
    babble_stft = stft(y)
    print(audio_stft.shape)
    print(babble_stft.shape)
def apply_beamfomer(args):
    """Beamform every recording listed in ``args.flist`` and write the result.

    Each non-empty line of the list file is loaded with
    ``load_multichannel_data``, transformed with ``stft``, and passed to the
    mask computer. The masks are reduced with a median over axis 1 and
    handed to the GEV wrapper (when ``args.gev`` is truthy) or the MVDR
    wrapper. The inverse STFT of the result is written to
    ``<args.dumps_dir>/<list name up to the first '.'>/<entry name>.wav``.

    Args:
        args: Namespace providing ``model``, ``flist``, ``dumps_dir`` and
            ``gev``.
    """
    estimator = MaskEstimator(num_bins)
    mask_computer = MaskComputer(estimator, args.model)

    flist_name = os.path.basename(args.flist)
    sub_dir = flist_name.split('.')[0]
    dumps_dir = os.path.join(args.dumps_dir, sub_dir)

    func_bf = gev_wrapper_on_masks if args.gev else mvdr_wrapper_on_masks

    if not os.path.exists(dumps_dir):
        os.makedirs(dumps_dir)

    with open(args.flist, 'r') as flist_fid:
        stripped = [line.strip() for line in flist_fid]
    # Blank lines (e.g. a trailing empty line) carry no path; skip them
    # instead of asking the loader to open ''.
    paths = [path for path in stripped if path]

    for path in paths:
        utt_name = path.split('/')[-1]
        noisy_samples = load_multichannel_data(path)
        noisy_specs = stft(noisy_samples, time_dim=1).transpose((1, 0, 2))
        mask_n, mask_x = mask_computer.compute_masks(
            np.abs(noisy_specs).astype(np.float32))
        mask_n = np.median(mask_n, axis=1)
        mask_x = np.median(mask_x, axis=1)
        clean_specs = func_bf(noisy_specs, mask_n, mask_x)
        clean_samples = istft(clean_specs)
        out_path = '{}/{}.wav'.format(dumps_dir, utt_name)
        print('dumps to {}'.format(out_path))
        audiowrite(clean_samples, out_path, 16000, True, True)
Beispiel #5
0
def prepare_clean_training_data(chime_data_dir, dest_dir):
    """Export mask training data using one long clean recording as speech.

    For each stage ('tr', 'dt') and each entry from ``gen_flist_simu`` the
    '.Noise' audio is loaded, and a clean segment as long as the entry's
    channel-1 noise file is cut from a fixed clean recording. The read
    position advances from entry to entry (and carries over between stages)
    and wraps to 0 when the next segment would run past the end of the
    clean recording. Masks from ``estimate_IBM`` and the magnitude of the
    summed spectra are pickled to ``<dest_dir>/<stage>/<name>``; the
    relative paths are listed in ``<dest_dir>/flist_<stage>.json``.

    Args:
        chime_data_dir: Data root, forwarded to ``gen_flist_simu``.
        dest_dir: Output root.
    """
    start = 0
    for stage in ['tr', 'dt']:
        # Counts how often the clean recording wrapped around in this stage.
        reset_counter = 0
        flist = gen_flist_simu(chime_data_dir, stage, ext=True)
        export_flist = []
        mkdir_p(os.path.join(dest_dir, stage))
        clean_data = audioread('/media/hipo/Mega Store/Dataset/single file/Chinese_tai_clean.wav')
        print("clean_data size:", clean_data.shape[0])
        for f in tqdm.tqdm(flist, desc='Generating data for {}'.format(stage)):
            noise_audio = get_audio_data(f, '.Noise')
            # Channel 1 of the noise recording is read only for its length.
            chime_size = audioread('{}.CH{}{}.Noise.wav'.format(f, 1, ''))
            end = chime_size.shape[0] + start
            if end > clean_data.shape[0]:
                # Not enough clean audio left: restart from the beginning.
                reset_counter += 1
                print("reset counter: ", reset_counter)
                start = 0
                end = chime_size.shape[0]
            y = clean_data[start:end]
            start = end

            # NOTE(review): the original looped `for i in range(1, 7)` but
            # appended only one segment, so the clean signal has a single
            # leading channel. Confirm whether six channels were intended.
            clean_audio = y[None, :].astype(np.float32)

            X = stft(clean_audio, time_dim=1).transpose((1, 0, 2))
            N = stft(noise_audio, time_dim=1).transpose((1, 0, 2))

            IBM_X, IBM_N = estimate_IBM(X, N)
            Y_abs = np.abs(X + N)
            export_dict = {
                'IBM_X': IBM_X.astype(np.float32),
                'IBM_N': IBM_N.astype(np.float32),
                'Y_abs': Y_abs.astype(np.float32)
            }
            base_name = f.split('/')[-1]
            export_name = os.path.join(dest_dir, stage, base_name)
            with open(export_name, 'wb') as fid:
                pickle.dump(export_dict, fid)
            export_flist.append(os.path.join(stage, base_name))
        with open(os.path.join(dest_dir, 'flist_{}.json'.format(stage)),
                  'w') as fid:
            json.dump(export_flist, fid, indent=4)
Beispiel #6
0
def prepare_training_data(chime_data_dir, dest_dir):
    """Export mask training data for the 'tr' and 'dt' stages.

    Every entry from ``gen_flist_simu`` contributes one pickle under
    ``<dest_dir>/<stage>/`` holding the two masks returned by
    ``estimate_IBM`` and the magnitude of the summed '.Clean' and '.Noise'
    spectra, all as float32. The relative paths of the pickles are stored
    in ``<dest_dir>/flist_<stage>.json``.
    """
    for stage in ['tr', 'dt']:
        entries = gen_flist_simu(chime_data_dir, stage, ext=True)
        written = []
        mkdir_p(os.path.join(dest_dir, stage))

        label = 'Generating data for {}'.format(stage)
        for entry in tqdm.tqdm(entries, desc=label):
            speech = get_audio_data(entry, '.Clean')
            noise = get_audio_data(entry, '.Noise')
            spec_x = stft(speech, time_dim=1).transpose((1, 0, 2))
            spec_n = stft(noise, time_dim=1).transpose((1, 0, 2))
            mask_x, mask_n = estimate_IBM(spec_x, spec_n)
            mixture_mag = np.abs(spec_x + spec_n)

            payload = {
                'IBM_X': mask_x.astype(np.float32),
                'IBM_N': mask_n.astype(np.float32),
                'Y_abs': mixture_mag.astype(np.float32)
            }

            # The last '/'-separated component of the entry is the file name.
            leaf = entry.split('/')[-1]
            with open(os.path.join(dest_dir, stage, leaf), 'wb') as out:
                pickle.dump(payload, out)
            written.append(os.path.join(stage, leaf))

        list_path = os.path.join(dest_dir, 'flist_{}.json'.format(stage))
        with open(list_path, 'w') as out:
            json.dump(written, out, indent=4)
Beispiel #7
0
def load_arrays_from_wav(base_dir, fname, idx, delay=0, divisor=16):
    """Load one audio file and return its complex STFT as complex64.

    Args:
        base_dir: Directory that contains the audio files.
        fname: Indexable collection of file names relative to ``base_dir``.
        idx: Index into ``fname`` selecting the file to load.
        delay: When positive, the audio is circularly shifted by this many
            positions along its last axis (``np.roll``) before the STFT.
        divisor: When greater than 1, the last axis of the spectrum is
            trimmed down to a multiple of ``divisor`` and axis 1 is
            edge-padded up to a multiple of ``divisor``.

    Returns:
        The spectrum as a ``np.complex64`` array. When the loaded audio,
        after a leading axis is added, is 3-D, the spectra of
        ``audio[:, 0]`` and ``audio[:, 1]`` are averaged.
    """
    kwargs = {'time_dim': 1, 'size': 512, 'shift': 160, 'window_length': 400}

    filename = os.path.join(base_dir, fname[idx])
    audio = np.expand_dims(audioread(filename), axis=0)
    if delay > 0:
        audio = np.roll(audio, delay, axis=-1)

    if audio.ndim == 3:
        # Average the spectra of the first two entries along axis 1.
        feats = stft(audio[:, 0], **kwargs) / 2
        feats += stft(audio[:, 1], **kwargs) / 2
    else:
        feats = stft(audio, **kwargs)

    # Make the spectrum dimensions multiples of `divisor`.
    if divisor > 1:
        excess = feats.shape[-1] % divisor
        # Trim only when there is something to trim: a stop of `-0` is the
        # same as `0` and would slice the whole axis away.
        if excess:
            feats = feats[:, :, :-excess]
        # NOTE(review): when axis 1 is already a multiple of `divisor` this
        # still appends a full extra `divisor` rows (kept as in the original).
        pad = ((0, 0), (0, divisor - feats.shape[1] % divisor), (0, 0))
        feats = np.pad(feats, pad, 'edge')

    return feats.astype(np.complex64)
Beispiel #8
0
def apply_beamfomer(args):
    """Beamform the single recording named by ``args.flist`` and write it.

    The recording is loaded with ``load_multichannel_data``, transformed
    with ``stft`` and passed to the mask computer. Both masks are reduced
    with a median over axis 1 and handed to the GEV wrapper (when
    ``args.gev`` is truthy) or the MVDR wrapper. The inverse STFT of the
    result is written to ``<args.dump>/<last path component>``.

    Args:
        args: Namespace providing ``model``, ``flist``, ``dump`` and ``gev``.
    """
    estimator = MaskEstimator(num_bins)
    mask_computer = MaskComputer(estimator, args.model)

    func_bf = gev_wrapper_on_masks if args.gev else mvdr_wrapper_on_masks

    # Use the whitespace-stripped path both for loading and for naming the
    # output, so stray whitespace or a newline cannot break the load.
    path = args.flist.strip()
    out_name = path.split('/')[-1]

    noisy_samples = load_multichannel_data(path)
    noisy_specs = stft(noisy_samples, time_dim=1).transpose((1, 0, 2))
    mask_n, mask_x = mask_computer.compute_masks(
        np.abs(noisy_specs).astype(np.float32))
    mask_n = np.median(mask_n, axis=1)
    mask_x = np.median(mask_x, axis=1)
    clean_specs = func_bf(noisy_specs, mask_n, mask_x)
    clean_samples = istft(clean_specs)
    audiowrite(clean_samples, '{}/{}'.format(args.dump, out_name), 16000,
               True, True)
Beispiel #9
0
def write_wav(magnitude,
              phase,
              filename,
              exponentiate=True,
              griffin_lim=False):
    """Resynthesize audio from magnitude and phase and write it to a file.

    Args:
        magnitude: Magnitude spectrum; passed through ``np.exp`` first when
            ``exponentiate`` is true.
        phase: Phase used for the initial resynthesis.
        filename: Destination handed to ``audiowrite``.
        exponentiate: Whether to apply ``np.exp`` to ``magnitude``.
        griffin_lim: When true, runs 10 refinement rounds in which the phase
            is re-estimated from the STFT of the current signal while the
            magnitude is kept fixed.
    """
    transform_opts = {'size': 512, 'shift': 64, 'window_length': 512}

    mag = np.exp(magnitude) if exponentiate else magnitude
    signal = istft(mag * np.exp(1j * phase), **transform_opts)

    if griffin_lim:
        for _ in range(10):
            estimated_phase = np.angle(stft(signal, **transform_opts))
            signal = istft(mag * np.exp(1j * estimated_phase),
                           **transform_opts)

    audiowrite(signal, filename)
Beispiel #10
0
def single_normal():
    """Enhance one recording and write a noise estimate and a GEV output.

    Uses the module-level ``args`` and ``model``. Audio is loaded from
    ``args.data_directory`` with ``ch_range=range(1, 3)`` at 16 kHz, masks
    are estimated by ``model.calc_masks``, and two files are written, named
    ``<args.exNum>_noise.wav`` and ``<args.exNum>_gev.wav``. Shapes of the
    intermediate arrays and a summary of the time spent on I/O, the network
    and beamforming are printed.
    """
    io_ms = 0
    net_ms = 0
    beamform_ms = 0

    # --- load audio (timed as I/O) ---
    with Timer() as timer:
        audio_data = get_audio_nochime(args.data_directory,
                                       ch_range=range(1, 3),
                                       fs=16000)
        context_samples = 0
        print("audio_data: ", audio_data.shape)
    io_ms += timer.msecs

    spec = stft(audio_data, time_dim=1).transpose((1, 0, 2))

    # Spectrum divided by its own magnitude.
    spec_phase = spec / abs(spec)
    print("Y: ", spec.shape, "Y_phase: ", spec_phase.shape)
    net_input = Variable(np.abs(spec).astype(np.float32))

    # --- mask estimation (timed as network) ---
    with Timer() as timer:
        N_masks, X_masks = model.calc_masks(net_input)
        print("N_masks: ", N_masks.shape)
        N_masks.to_cpu()
        X_masks.to_cpu()
    net_ms += timer.msecs

    # --- noise estimate and beamforming (timed as beamformer) ---
    with Timer() as timer:
        noise_mask = np.median(N_masks.data, axis=1)
        speech_mask = np.median(X_masks.data, axis=1)

        # Noise-masked spectrum, multiplied by the phase term, then reduced
        # with a median over axis 1.
        noise_spec = np.multiply(np.multiply(N_masks.data, spec), spec_phase)
        noise_spec = np.median(noise_spec, axis=1)

        enhanced_spec = gev_wrapper_on_masks(spec, noise_mask, speech_mask)
    beamform_ms += timer.msecs

    # --- write results (timed as I/O) ---
    with Timer() as timer:
        noise_wave = istft(noise_spec)[context_samples:]
        audiowrite(
            noise_wave,
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_noise.wav"
            .format(args.exNum), 16000, True, True)
        enhanced_wave = istft(enhanced_spec)[context_samples:]
        audiowrite(
            enhanced_wave,
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_gev.wav"
            .format(args.exNum), 16000, True, True)
    io_ms += timer.msecs

    total_ms = io_ms + net_ms + beamform_ms
    print(
        'Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | Total: {:.2f}s'
        .format(io_ms / 1000, net_ms / 1000, beamform_ms / 1000,
                (total_ms / 1000)))
Beispiel #11
0
        )))

# Accumulated time per phase, summed from Timer().msecs
# (presumably milliseconds, going by the attribute name).
t_io = 0
t_net = 0
t_beamform = 0
# Beamform loop: one iteration per entry of `flist`.
for cur_line in tqdm(flist):
    # Load the audio for this entry. 'simu' entries are a single argument
    # and use no context; 'real' entries are indexed as three fields and
    # the loader also returns `context_samples`.
    # NOTE(review): for any other `scenario` value `audio_data` is never
    # assigned here.
    with Timer() as t:
        if scenario == 'simu':
            audio_data = get_audio_data(cur_line)
            context_samples = 0
        elif scenario == 'real':
            audio_data, context_samples = get_audio_data_with_context(
                    cur_line[0], cur_line[1], cur_line[2])
    t_io += t.msecs
    # STFT with the first two axes swapped; its float32 magnitude is the
    # network input.
    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
    Y_var = Variable(np.abs(Y).astype(np.float32))
    if args.gpu >= 0:
        Y_var.to_gpu(args.gpu)
    # Mask estimation; results are moved back to the CPU for NumPy.
    with Timer() as t:
        N_masks, X_masks = model.calc_masks(Y_var)
        N_masks.to_cpu()
        X_masks.to_cpu()
    t_net += t.msecs

    # Reduce each mask with a median over axis 1 (presumably the channel
    # axis, TODO confirm), then run the requested beamformers.
    with Timer() as t:
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        Y_hat_dicts = bf_wrapper_on_masks(Y, N_mask, X_mask, beamformers=beamformers)
    t_beamform += t.msecs
Beispiel #12
0
# Accumulated time per phase, summed from Timer().msecs
# (presumably milliseconds, going by the attribute name).
t_io = 0
t_net = 0
t_beamform = 0

# Load the recording with ch_range=range(1, 9) and fs=48000; timed as I/O.
with Timer() as t:

    audio_data = get_audio_nochime('new_dataset/2m/2m_pub_new',
                                   ch_range=range(1, 9),
                                   fs=48000)
    # audio_data = get_audio_nochime('new_dataset/new_audio/AUDIO_RECORDING', ch_range=range(1, 9), fs=49000)

# Add the time spent loading the audio files to the I/O total.
t_io += t.msecs

# Transform the audio into the frequency domain (first two axes swapped).
Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
print(audio_data.shape, type(audio_data))

# Network input: float32 magnitude of the spectrum.
# NOTE(review): the second positional argument `True` is presumably
# Chainer's legacy `volatile` flag; confirm against the Chainer version used.
Y_var = Variable(np.abs(Y).astype(np.float32), True)

# Mask estimation; results are moved back to the CPU for NumPy.
with Timer() as t:
    N_masks, X_masks = model.calc_masks(Y_var)
    N_masks.to_cpu()
    X_masks.to_cpu()
t_net += t.msecs

with Timer() as t:
    N_mask = np.median(N_masks.data, axis=1)
    X_mask = np.median(X_masks.data, axis=1)
    print("Y: ",
Beispiel #13
0
def prepare_other_training_data(train_dir, dest_dir):
    """Export mask training data from clean files plus one long noise file.

    For each stage ('tr', 'dt') every regular file in
    ``<train_dir[:-1]>/<stage>`` is loaded as the clean signal, and a noise
    segment of the same length is cut from a fixed noise recording. The
    read position advances from file to file (and carries over between
    stages) and wraps to 0 when the next segment would run past the end of
    the noise recording. Masks from ``estimate_IBM`` and the magnitude of
    the summed spectra are pickled to ``<dest_dir>/<stage>/<file name>``;
    the relative paths are listed in ``<dest_dir>/flist_<stage>.json``.

    Args:
        train_dir: Training data root. Its last character is dropped before
            the stage name is appended (presumably a trailing slash;
            TODO confirm).
        dest_dir: Output root.
    """
    start = 0

    for stage in ['tr', 'dt']:
        # Select the stage directory by value; comparing a string to a
        # literal with `is` tests identity and is not guaranteed to match.
        chime_data_dir = os.path.join(train_dir[:-1], stage)
        print(chime_data_dir)

        # Counts how often the noise recording wrapped around in this stage.
        reset_counter = 0
        flist = [f for f in listdir(chime_data_dir) if isfile(join(chime_data_dir, f))]
        export_flist = []
        mkdir_p(os.path.join(dest_dir, stage))
        noise_data = audioread('/media/hipo/lento/Dataset/single file/noise_files/all_noise.wav')
        print("noise_data size:", noise_data.shape[0])
        for f in tqdm.tqdm(flist, desc='Generating data for {}'.format(stage)):
            path = os.path.join(chime_data_dir, f)
            clean_audio = get_audio_single(path)
            # Read again only to obtain the length of the clean file.
            chime_size = audioread(path)

            end = chime_size.shape[0] + start
            if end > noise_data.shape[0]:
                # Not enough noise left: restart from the beginning.
                reset_counter += 1
                print("reset counter: ", reset_counter)
                start = 0
                end = chime_size.shape[0]
            y = noise_data[start:end]
            start = end

            # Single noise segment with a leading axis of length 1.
            noise_audio = y[None, :].astype(np.float32)

            X = stft(clean_audio, time_dim=1).transpose((1, 0, 2))
            N = stft(noise_audio, time_dim=1).transpose((1, 0, 2))

            IBM_X, IBM_N = estimate_IBM(X, N)
            Y_abs = np.abs(X + N)
            export_dict = {
                'IBM_X': IBM_X.astype(np.float32),
                'IBM_N': IBM_N.astype(np.float32),
                'Y_abs': Y_abs.astype(np.float32)
            }
            base_name = f.split('/')[-1]
            export_name = os.path.join(dest_dir, stage, base_name)
            with open(export_name, 'wb') as fid:
                pickle.dump(export_dict, fid)
            export_flist.append(os.path.join(stage, base_name))
        with open(os.path.join(dest_dir, 'flist_{}.json'.format(stage)),
                  'w') as fid:
            json.dump(export_flist, fid, indent=4)
Beispiel #14
0
    )))

# Accumulated time per phase, summed from Timer().msecs
# (presumably milliseconds, going by the attribute name).
t_io = 0
t_net = 0
t_beamform = 0
# Beamform loop: one iteration per entry of `flist`.
for cur_line in tqdm(flist):
    # Load the audio for this entry. 'simu' entries are a single argument
    # and use no context; 'real' entries are indexed as three fields and
    # the loader also returns `context_samples`.
    # NOTE(review): for any other `scenario` value `audio_data` is never
    # assigned here.
    with Timer() as t:
        if scenario == 'simu':
            audio_data = get_audio_data(cur_line)
            context_samples = 0
        elif scenario == 'real':
            audio_data, context_samples = get_audio_data_with_context(
                    cur_line[0], cur_line[1], cur_line[2])
    t_io += t.msecs
    # STFT with the first two axes swapped; its float32 magnitude is the
    # network input.
    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
    # NOTE(review): the second positional argument `True` is presumably
    # Chainer's legacy `volatile` flag; confirm against the Chainer version.
    Y_var = Variable(np.abs(Y).astype(np.float32), True)
    if args.gpu >= 0:
        Y_var.to_gpu(args.gpu)
    # Mask estimation; results are moved back to the CPU for NumPy.
    with Timer() as t:
        N_masks, X_masks = model.calc_masks(Y_var)
        N_masks.to_cpu()
        X_masks.to_cpu()
    t_net += t.msecs

    # Reduce each mask with a median over axis 1 (presumably the channel
    # axis, TODO confirm), then apply the GEV beamformer wrapper.
    with Timer() as t:
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs