Example #1
def no_separation(args):
    """Write out un-separated mixture as baseline. 
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "no_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                mixed_audio, fs)
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), mixed_audio,
                fs)

    print("Write out finished!")
Example #2
def create_room(source_file, noise_file, dist):

    (clean, fs) = pp.read_audio(source_file)
    (noise, _) = pp.read_audio(noise_file)

    for file in os.listdir(os.path.join("data_eval", "dnn1_in")):
        file_path = os.path.join("data_eval", "dnn1_in", file)
        os.remove(file_path)

    for n in range(len(dist)):

        mixed, noise_new, clean_new, s2nr = set_microphone_at_distance(
            clean, noise, fs, dist[n])

        # s2nr = 1 / (1 + (1 / float(snr)))

        mixed_name = "mix_%s_%s" % (str(
            dist[n]), os.path.basename(source_file))
        clean_name = "clean_%s_%s" % (str(
            dist[n]), os.path.basename(source_file))

        mixed_path = os.path.join('data_eval/dnn1_in', mixed_name)
        clean_path = os.path.join('data_eval/dnn1_in', clean_name)

        pp.write_audio(mixed_path, mixed, fs)
        # Also save the clean reference alongside the mixture.
        pp.write_audio(clean_path, clean_new, fs)
Example #3
def demo(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "FullyCNN.h5")
    model = load_model(model_path)

    # Load test data.
    if args.online:
        print('recording....')
        recordfile = 'record.wav'
        my_record(recordfile, 16000, 2)
        print('recording end')
        (data, _) = pp_data.read_audio(recordfile, 16000)
    else:
        testfile = 'data_cache/test_speech/1568253725.587787.wav'
        (data, _) = pp_data.read_audio(testfile, 16000)
    mixed_complx_x = pp_data.calc_sp(data, mode='complex')
    mixed_x, mixed_phase = divide_magphase(mixed_complx_x, power=1)

    # Predict.
    pred = model.predict(mixed_x)
    # Recover enhanced wav.
    pred_sp = pred  # np.exp(pred)
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    hop_size = n_window - n_overlap
    ham_win = np.sqrt(np.hanning(n_window))  # square-root Hann window (the "ham" name is historical)
    stft_reconstructed_clean = merge_magphase(pred_sp, mixed_phase)
    stft_reconstructed_clean = stft_reconstructed_clean.T
    signal_reconstructed_clean = librosa.istft(stft_reconstructed_clean, hop_length=hop_size, window=ham_win)
    signal_reconstructed_clean = signal_reconstructed_clean*32768
    s = signal_reconstructed_clean.astype('int16')

    # Write out enhanced wav.
    # out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
    # pp_data.create_folder(os.path.dirname(out_path))
    pp_data.write_audio('1568253725.587787ehs.wav', s, fs)
Example #4
def main(from_dir, to_dir, sr):
    from_paths = wav_paths(from_dir)
    for from_p in tqdm(from_paths, 'Resampling audio'):
        rel_p = PurePath(from_p).relative_to(from_dir)
        to_p = to_dir / rel_p
        os.makedirs(to_p.parent, exist_ok=True)

        wav, _ = read_audio(from_p, sr)
        write_audio(to_p, wav, sr)
Example #5
def DAB_generate(source_audio, out_folder, name):

    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    # number of microphones
    M = 4

    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    distances = np.random.randint(1, 20, M)

    mic_pos = []
    for m in range(M):
        mic_distance = distances[m]
        mic_m = guess_microphone(
            source_position, mic_distance
        )  # rejection sampling: guess positions until one lands inside the room (slow for small rooms)
        mic_pos.append(mic_m)

    out_mic_file = os.path.join(out_folder, 'log_%s.txt' % name)

    if os.path.exists(out_mic_file):
        os.remove(out_mic_file)
    with open(out_mic_file, 'w') as f1:
        for l in range(M):
            f1.write("%s, %f\n" % (str(mic_pos[l]), distances[l]))

    Lg_t = 0.100  # filter size in seconds
    Lg = int(np.ceil(Lg_t * fs))  # in samples
    fft_len = 512
    mics = pra.Beamformer(np.asarray(mic_pos).T, shoebox.fs, N=fft_len, Lg=Lg)

    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)

    shoebox.compute_rir()
    shoebox.simulate()

    # ADDING NOISE AND SAVING

    for n in range(M):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)
        mixed_signal = add_noise(source_audio, signal)
        mixed_signal = np.array(mixed_signal, dtype=np.int16)
        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)
Example #6
def DS_generate(source_audio, out_folder, name):

    # Create the shoebox
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )
    mic_distance = random.randint(1, 20)  # distance from source to microphone
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    # random way: guess microphone position until it's in the room: very long time for small rooms
    mic_in_room = False
    while not mic_in_room:
        theta = random.uniform(0, 2 * math.pi)
        mic_position = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        print(mic_position)
        if (0 <= mic_position[0] <= room_dimensions[0]) and (
                0 <= mic_position[1] <= room_dimensions[1]):
            mic_in_room = True

    # source and mic locations
    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(
        pra.MicrophoneArray(np.array([mic_position]).T, shoebox.fs))

    shoebox.simulate()

    signal = shoebox.mic_array.signals[0, :]
    mixed_signal = add_noise(source_audio, signal)
    mixed_signal = pra.utilities.normalize(mixed_signal, bits=16)
    mixed_signal = np.array(mixed_signal, dtype=np.int16)

    pp.write_audio(os.path.join(out_folder, 'mix_%s' % name), mixed_signal, fs)
Example #7
def separate(args, bgn_iter, fin_iter, interval):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and sum
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)

    at_probs_list, seg_masks_list = [], []
    for iter in range(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                      "seg_masks.p")
        seg_masks = cPickle.load(open(seg_masks_path, 'rb'))
        seg_masks_list.append(seg_masks)
    seg_masks = np.mean(seg_masks_list,
                        axis=0)  # (n_clips, n_classes, n_time, n_freq)

    print(seg_masks.shape)

    #
    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    seg_stats = {}
    for e in events:
        seg_stats[e] = {
            'fvalue': [],
            'auc': [],
            'iou': [],
            'hit': [],
            'fa': [],
            'tp': [],
            'fn': [],
            'fp': []
        }

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Write out separated events.
        for j1 in range(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1],
                                                            event_sp,
                                                            noise_sp,
                                                            sed_y[i1, :, j1],
                                                            seg_thres,
                                                            inside_only=True)
                (hit, fa) = hit_fa(sm_upsampled[j1],
                                   event_sp,
                                   noise_sp,
                                   sed_y[i1, :, j1],
                                   seg_thres,
                                   inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp,
                    mixed_cmplx_sp,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler

                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(sep_noise_sp,
                                                      mixed_cmplx_sp,
                                                      n_overlap=n_overlap,
                                                      winfunc=np.hamming,
                                                      wav_len=int(
                                                          fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break


    fvalues, aucs, ious, hits, fas, tps, fns, fps = [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" %
                 ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
        ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
         np.mean(ious), np.mean(hits), np.mean(fas), np.mean(hits) -
         np.mean(fas), np.mean(tps), np.mean(fns), np.mean(fps)))
    for i1 in range(len(events)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1],
             fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))
Example #8
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)
    else:
        raise ValueError("Unsupported window_type: %s" % window_type)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()

        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                plt.show()
Example #9
def inference(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)
    audio_type = 'speech'
    
    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)
    else:
        raise ValueError("Unsupported window_type: %s" % window_type)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)
    
    speech_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/timit_wavs/subtest"
    
    # Load model
    model_path = os.path.join(workspace, "models", filename, audio_type, "md_%d_iters.tar" % iter)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])
    
    if cuda:
        model.cuda()
        
    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)
    
    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1
        
    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)
    
    dft = pp_data.DFT(fft_size, cuda)
        
    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio0, _) = pp_data.read_audio(audio_path, sample_rate)
            
            audio = pp_data.normalize(audio0)
            
            # Enframe
            frames = stft.enframe(audio, fft_size, hop_size)
            
            # Process data. 
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(frames, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)
            
            pred_frames = forward(model, x, mean_, std_, cuda)
            
            pred_frames = pred_frames.data.cpu().numpy()
            
            # cola_constant = 0.5
            # seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            
            pred_frames *= window
            
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            seq = seq[0 : len(audio)]
            
            
            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)
            
            if visualize:
                
                clean_audio_path = os.path.join(speech_dir, name.split('.')[0] + ".WAV")
                (clean_audio, _) = pp_data.read_audio(clean_audio_path, sample_rate)
                clean_audio = pp_data.normalize(clean_audio)
                clean_frames = stft.enframe(clean_audio, fft_size, hop_size)
                
                mix_sp = np.abs(np.fft.rfft(frames * window, norm='ortho'))
                enh_sp = np.abs(np.fft.rfft(pred_frames * window, norm='ortho'))
                clean_sp = np.abs(np.fft.rfft(clean_frames * window, norm='ortho'))
                
                K = 10
                fig, axs = plt.subplots(K // 2, 2, sharex=True)
                for k in range(K):
                    axs[k // 2, k % 2].plot(frames[k+100], color='y')
                    axs[k // 2, k % 2].plot(clean_frames[k+100], color='r')
                    axs[k // 2, k % 2].plot(pred_frames[k+100], color='b')
                plt.show()
                
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3,1, sharex=True)
                axs[0].matshow(np.log(np.abs(mix_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[1].matshow(np.log(np.abs(clean_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[2].matshow(np.log(np.abs(enh_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                plt.show()
Example #10
def test(args):
    """Inference all test data, write out recovered wavs to disk.

	Args:
	  workspace: str, path of workspace.
	  tr_snr: float, training SNR.
	  te_snr: float, testing SNR.
	  n_concat: int, number of frames to concatenta, should equal to n_concat
		  in the training stage.
	  iter: int, iteration of model to load.
	  visualize: bool, plot enhanced spectrogram for debug.
	"""
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "FullyCNN.h5")
    model = load_model(model_path)

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x, noise_x, alpha, na] = data

        mixed_x, mixed_phase = divide_magphase(mixed_cmplx_x,
                                               power=1)  # power=1 gives the magnitude spectrum
        speech_x, clean_phase = divide_magphase(speech_cmplx_x, power=1)

        # Predict.
        pred = model.predict(mixed_x)
        print(cnt, na)
        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(speech_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = pred  # np.exp(pred)
        n_window = cfg.n_window
        n_overlap = cfg.n_overlap
        hop_size = n_window - n_overlap
        ham_win = np.sqrt(np.hanning(n_window))  # square-root Hann window (the "ham" name is historical)
        stft_reconstructed_clean = merge_magphase(pred_sp, mixed_phase)
        stft_reconstructed_clean = stft_reconstructed_clean.T
        signal_reconstructed_clean = librosa.istft(stft_reconstructed_clean,
                                                   hop_length=hop_size,
                                                   window=ham_win)
        signal_reconstructed_clean = signal_reconstructed_clean * 32768
        s = signal_reconstructed_clean.astype('int16')

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
Example #11
def inference(args):
    workspace = "workspace"
    n_concat = 11
    iter = 50000
    n_window = 320
    n_overlap = 160
    fs = 16000
    # Load model.
    model_path = os.path.join(workspace, "models", "crn_mixdb",
                              "md_%diters.h5" % iter)
    model = load_model(model_path, custom_objects={'keras': keras})
    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "crn_mixdb")
    #feat_dir = os.path.join(workspace, "features", "spectrogram", "train", "office_mixdb")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        # Process data.
        n_pad = (n_concat - 1)
        #mixed_x = pad_with_border(mixed_x, n_pad)
        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat,
                                          hop=11)  # (n_segs, n_concat, n_freq)
        #mixed_x = pad_with_border(mixed_x, n_pad)
        #mixed_x_3d = mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # Predict.
        w, h, l = mixed_x_3d.shape
        pred = model.predict(mixed_x_3d)
        pred_sp = np.reshape(pred, [w * h, l])
        mixed_cmplx_x = mixed_cmplx_x[:w * h, :]
        #pred_sp = pred[:, -1, :]
        print(cnt, na)
        if False:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(speech_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred_sp.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(1))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()
        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2).sum())
        # Scale to compensate the amplitude change from windowing and ISTFT.
        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "crn_mixdb",
                                "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
Example #12
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "%ddb" % int(tr_snr), "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(speech_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2).sum())
        # Scale to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
Example #13
def prepare_database():

    (noise, _) = pp.read_audio(conf2.noise_path)

    with open(os.path.join('dnn2', 'dnn2_files_list.txt')) as f:
        dnn2_data = f.readlines()

    (model1, scaler1) = dnn1.load_dnn()

    # generate train mean values

    snr2_list = []
    mixed_avg = []
    clean_avg = []
    enh_avg = []

    for n in range(conf2.training_number):
        current_file = (random.choice(dnn2_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)

        mixed, noise_new, clean_new, s2nr = set_microphone_at_distance(
            clean, noise, conf2.fs, dist)

        (_, enh, _) = dnn1.predict_file(current_file, model1, scaler1)

        # s2nr = 1 / (1 + (1 / float(snr)))
        snr2_list.append(s2nr)

        mixed_avg.append(np.mean(mixed))
        clean_avg.append(np.mean(clean_new))
        enh_avg.append(np.mean(enh))

        sr = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(5))
        path_list = current_file.split(os.sep)
        mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                       os.path.basename(current_file))
        clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                         os.path.basename(current_file))
        enh_name = "enh_%s_%s_%s" % (path_list[2], sr,
                                     os.path.basename(current_file))

        if n % 10 == 0:
            print(n)

        if conf2.save_single_files and n < conf1.n_files_to_save:

            mixed_path = os.path.join(conf2.train_folder, mixed_name)
            clean_path = os.path.join(conf2.train_folder, clean_name)
            enh_path = os.path.join(conf2.train_folder, enh_name)
            pp.write_audio(mixed_path, mixed, conf2.fs)
            pp.write_audio(clean_path, clean_new, conf2.fs)
            pp.write_audio(enh_path, enh, conf2.fs)

    if len(mixed_avg) != len(enh_avg):
        raise Exception('Number of mixed and enhanced audio must be the same')

    num_tr = len(mixed_avg)

    if os.path.exists(os.path.join(conf2.train_folder, 'train_data.txt')):
        os.remove(os.path.join(conf2.train_folder, 'train_data.txt'))
    with open(os.path.join(conf2.train_folder, 'train_data.txt'), 'w') as f1:
        for line1, line2, line3 in zip(mixed_avg, clean_avg, snr2_list):
            f1.write("%s, %s, %s\n" % (line1, line2, line3))

    print(len(mixed_avg), ',', len(enh_avg))

    # generate test spectrograms

    snr2_list = []
    mixed_avg = []
    clean_avg = []
    enh_avg = []

    for n in range(conf2.test_number):
        current_file = (random.choice(dnn2_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)

        mixed, noise_new, clean_new, s2nr = set_microphone_at_distance(
            clean, noise, conf2.fs, dist)

        (_, enh, _) = dnn1.predict_file(current_file, model1, scaler1)

        # s2nr = 1 / (1 + (1 / float(snr)))
        snr2_list.append(s2nr)

        mixed_avg.append(np.mean(mixed))
        clean_avg.append(np.mean(clean_new))
        enh_avg.append(np.mean(enh))

        sr = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(5))
        path_list = current_file.split(os.sep)
        mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                       os.path.basename(current_file))
        clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                         os.path.basename(current_file))
        enh_name = "enh_%s_%s_%s" % (path_list[2], sr,
                                     os.path.basename(current_file))

        if n % 10 == 0:
            print(n)

        if conf2.save_single_files and n < conf1.n_files_to_save:

            mixed_path = os.path.join(conf2.test_folder, mixed_name)
            clean_path = os.path.join(conf2.test_folder, clean_name)
            enh_path = os.path.join(conf2.test_folder, enh_name)
            pp.write_audio(mixed_path, mixed, conf2.fs)
            pp.write_audio(clean_path, clean_new, conf2.fs)
            pp.write_audio(enh_path, enh, conf2.fs)

    print(len(mixed_avg), ',', len(enh_avg))

    if len(mixed_avg) != len(enh_avg):
        raise Exception('Number of mixed and enhanced audio must be the same')

    num_te = len(mixed_avg)

    if os.path.exists(os.path.join(conf2.test_folder, 'test_data.txt')):
        os.remove(os.path.join(conf2.test_folder, 'test_data.txt'))
    with open(os.path.join(conf2.test_folder, 'test_data.txt'), 'w') as f1:
        for line1, line2, line3 in zip(mixed_avg, clean_avg, snr2_list):
            f1.write("%s, %s, %s\n" % (line1, line2, line3))

    return num_tr, num_te
Example #14
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    calc_log = args.calc_log
    model_file = args.model_file

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Build model
    n_concat = 7  # hard-coded here; overrides args.n_concat
    n_freq = 257
    n_hid = 2048
    lr = 1e-3

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    if calc_log:
        model.add(Dense(n_freq, activation='linear'))
    else:
        model.add(Dense(n_freq, activation='relu'))
    model.summary()

    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Load model.
    if (model_file == "null"):
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                                  "md_%diters.h5" % iter)
        #model = load_model(model_path)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler.
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            #speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if calc_log:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            #speech_x = pp_data.scale_on_2d(speech_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max

            speech_x_max = np.max(speech_x)
            print("max of speech_x:", speech_x_max)
            speech_x = speech_x / speech_x_max

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            #speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            #speech_x = speech_x * 16384
            pred = pred * mixed_x_max

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            #axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        if calc_log:
            pred_sp = np.exp(pred)
        else:
            #gv = 0.025
            #pred_sp = np.maximum(0,pred - gv)
            pred_sp = pred


        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2).sum())
        # Scale to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        # Write out enhanced pcm 8K pcm_s16le.
        out_pcm_path = os.path.join(workspace, "enh_wavs", "test",
                                    "%ddb" % int(te_snr), "%s.enh.pcm" % na)
        cmd = ' '.join([
            "./ffmpeg -y -i ", out_path,
            " -f s16le -ar 8000 -ac 1 -acodec pcm_s16le ", out_pcm_path
        ])
        os.system(cmd)

        # Write out webrtc-denoised enhanced pcm 8K pcm_s16le.
        ns_out_pcm_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr),
                                       "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr),
                                       "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path])
        os.system(cmd)
        cmd = ' '.join([
            "./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i ",
            ns_out_pcm_path, "  ", ns_out_wav_path
        ])
        os.system(cmd)

        cmd = ' '.join(["rm ", out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["rm ", ns_out_pcm_path])
        os.system(cmd)
Example #15
def ibm_separation(args):
    """Ideal binary mask (IBM) source separation. 
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "ibm_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_sec = cfg.clip_sec

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            [f, t, bg_spec] = signal.spectrogram(x=bg_audio,
                                                 window=ham_win,
                                                 nperseg=n_window,
                                                 noverlap=n_overlap,
                                                 detrend=False,
                                                 return_onesided=True,
                                                 scaling='density',
                                                 mode='magnitude')

            [f, t, event_spec] = signal.spectrogram(x=event_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='magnitude')

            [f, t, mixed_spec] = signal.spectrogram(x=mixed_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            ratio = 1.7  # ~5 dB magnitude threshold
            # 1 where event_spec > bg_spec * ratio, else 0: the ideal binary mask.
            event_mask = (np.sign(event_spec / (bg_spec * ratio) - 1) + 1) / 2
            bg_mask = 1. - event_mask

            bg_separated_spec = np.abs(mixed_spec) * bg_mask
            event_separated_spec = np.abs(mixed_spec) * event_mask

            # Write out separated music
            s = spectrogram_to_wave.recover_wav(bg_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                s, fs)

            # Write out separated vocal
            s = spectrogram_to_wave.recover_wav(event_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), s, fs)

    print("Finished!")
Example #16
def dab_run(snr_list, file_name="dab_out", mode='dab'):

    output_file_folder = os.path.join("data_eval", mode)

    # removing previous enhancements
    for file in os.listdir(os.path.join("data_eval", "dnn1_out")):
        file_path = os.path.join("data_eval", "dnn1_out", file)
        os.remove(file_path)

    dnn1_inputs, dnn1_outputs = dnn1.predict_folder(
        os.path.join("data_eval", "dnn1_in"),
        os.path.join("data_eval", "dnn1_out"))

    names = [
        f for f in sorted(os.listdir(os.path.join("data_eval", "dnn1_out")))
        if f.startswith("enh")
    ]
    dnn1_outputs = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join("data_eval", "dnn1_out", na)
        (a, _) = pp.read_audio(file_path)
        enh_complex = pp.calc_sp(a, 'complex')
        dnn1_outputs.append(enh_complex)

    # s2nrs = dnn2.predict("data_eval/dnn1_in", "data_eval/dnn1_out")

    # snr = np.array([5.62, 1.405, 0.703, 0.281])
    # snr = np.array([5.62, 2.81, 1.875, 1.406])
    s2nrs = list(snr_list)  # copy
    for i in range(len(snr_list)):
        s2nrs[i] = 1 / (1 + 1 / snr_list[i])  # S2NR = S / (S + N)

    ch_rw_outputs = []
    # calculate channel weights
    if mode == 'dab':
        new_weights = channel_weights(s2nrs)
        print(new_weights)
        # multiply enhanced audio for the corresponding weight
        for i, p in zip(dnn1_outputs, new_weights):
            ch_rw_outputs.append(p * i)

    # cancel reweighting if db mode
    if mode == 'db':
        new_weights = s2nrs
        print(new_weights)
        ch_rw_outputs = dnn1_outputs

    # execute mvdr
    final = mvdr(dnn1_inputs, ch_rw_outputs)

    (init,
     _) = pp.read_audio(os.path.join('data_eval', 'test_speech', file_name))
    init_sp = pp.calc_sp(init, mode='complex')

    visualize(dnn1_colors(np.abs(init_sp)), dnn1_colors(np.abs(final)),
              "source amplitude", "final amplitude")

    # Recover and save enhanced wav
    pp.create_folder(output_file_folder)
    s = recover_wav_complex(final, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(
        conf1.n_window)**2).sum())  # Scale to compensate the amplitude change
    audio_path = os.path.join(output_file_folder, file_name)
    pp.write_audio(audio_path, s, conf1.sample_rate)

    print('%s done' % mode)
Example #17
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    #tr_snr = args.tr_snr
    #te_snr = args.te_snr
    n_concat = args.n_concat
    #iter = args.iteration
    TF = args.TF
    model_name = args.model_name
    
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    #snr = cfg.SNR
    n_hop = int(n_window-n_overlap)
    fs = cfg.sample_rate
    scale = True
    
    # Load model
    t1 = time.time()
    #model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    mag_model_root = os.path.join(workspace, "saved_models", "%s" % model_name )
    #model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    mag_model_files = find_models(mag_model_root)
    epoch_num = []
    for i in range(len(mag_model_files)):
        epoch_num.append(int(mag_model_files[i].split("/")[-1].split('-')[2]))
    mag_model_index = epoch_num.index(max(epoch_num))
    mag_model_path = mag_model_files[mag_model_index]
    print("The selected model path is %s :" % mag_model_path)
    
    mag_model = load_model(mag_model_path)
    
    '''
    # loading phase model
    phase_model_root = os.path.join(workspace, "phase_saved_models", "%s" % model_name )
    #model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    phase_model_files = find_models(phase_model_root)
    epoch_num1 = []
    for i in range(len(phase_model_files)):
        epoch_num1.append(int(phase_model_files[i].split("/")[-1].split('-')[2]))
    phase_model_index = epoch_num1.index(max(epoch_num1))
    phase_model_path = phase_model_files[phase_model_index]
    print("The selected model path is %s :" % phase_model_path)
    
    phase_model = load_model(phase_model_path)
    '''
    # Load scaler
    mag_scaler_path = os.path.join(workspace, "packed_features", "train", "mag_scaler.p")
    mag_scaler = pickle.load(open(mag_scaler_path, 'rb'))
    
    #phase_scaler_path = os.path.join(workspace, "packed_features", "train", "phase_scaler.p")
    #phase_scaler = pickle.load(open(phase_scaler_path, 'rb'))
    
    # Load test data. 
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature. 
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x] = data
        n_pad = (n_concat - 1) // 2
        
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # mixed_phase = np.angle(mixed_cmplx_x)
            # Process data. 
            #n_pad = (n_concat - 1) / 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            # mixed_phase = pp_data.pad_with_border(mixed_phase, n_pad)
            
            # speech_x = pp_data.log_sp(np.abs(speech_cmplx_x))
            #speech_phase = np.angle(speech_cmplx_x)

            
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")
            
        # Scale data. 
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, mag_scaler)
            #mixed_phase = pp_data.scale_on_2d(mixed_phase, phase_scaler)
            #speech_phase = pp_data.scale_on_2d(speech_phase, phase_scaler)
        
        # Cut input spectrogram to 3D segments with n_concat. 
        #mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        #mixed_phase_3d = pp_data.mat_2d_to_3d(mixed_phase, agg_num=n_concat, hop=1)
        #print("loading data time: %s s" % (time.time() - t1,))
        '''
        # Tap an intermediate layer: model.layers[0] is the input; change
        # model.layers[2] to the index of the layer whose output you want.
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]  # only the input changes
        # Show the feature maps after the first conv layer; the output is
        # (1, 149, 149, 32) = (samples, height, width, n_feature_maps).
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''
        # Predict. 
        t2 = time.time()
        mag_pred = mag_model.predict(mixed_x_3d)
        #phase_pred = phase_model.predict(mixed_phase_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))
        #print(pred)
        
        # Inverse scale. 
        if scale:
            # mixed_x = pp_data.inverse_scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, mag_scaler)
            mag_pred = pp_data.inverse_scale_on_2d(mag_pred, mag_scaler)
            
            #mixed_phase = pp_data.inverse_scale_on_2d(mixed_phase, phase_scaler)
            #speech_phase = pp_data.inverse_scale_on_2d(speech_phase, phase_scaler)
            #phase_pred = pp_data.inverse_scale_on_2d(phase_pred, phase_scaler)
        
       
                    

        # Recover enhanced wav. 
        #pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10**(mag_pred/10))-1e-10
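            # The inversion above assumes log_sp stored 10*log10(x + 1e-10);
            # note that inference1111 below inverts with 10**(pred/20)
            # instead, so the two functions assume different log conventions.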
            #pred_ph = np.exp(1j * phase_pred)
            '''
            R = np.multiply(pred_sp, pred_ph)
            result = librosa.istft(R.T,
                                   hop_length=n_hop,
                                   win_length=cfg.n_window,
                                   window=scipy.signal.hamming, center=False)
            result /= abs(result).max()
            y_out = result*0.8'''
            #s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            #s *= np.sqrt((np.hamming(n_window)**2).sum())   # Scaler for compensate the amplitude 
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
            
        # Write out enhanced wav. 
        out_path = os.path.join(workspace, "enh_flipphase", "test", "%s" % model_name, "{}_fft_dnn_map.wav".format(na.split('.')[0]))
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))
        
    print("total test time: %s s" % (time.time() - t1,))    
Ejemplo n.º 18
0
def inference1111(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenta, should equal to n_concat 
          in the training stage. 
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    #tr_snr = args.tr_snr
    #te_snr = args.te_snr
    n_concat = args.n_concat
    #iter = args.iteration
    TF = args.TF
    model_name = args.model_name
    
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    #snr = cfg.SNR
    n_hop = int(n_window-n_overlap)
    fs = cfg.sample_rate
    scale = True
    
    # Load model
    t1 = time.time()
    #model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    model_root = os.path.join(workspace, "saved_models", "%s" % model_name )
    #model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    model_files = find_models(model_root)
    epoch_num = []
    for model_file in model_files:
        epoch_num.append(int(os.path.basename(model_file).split('-')[2]))
    model_index = epoch_num.index(max(epoch_num))
    model_path = model_files[model_index]
    print("Selected model path: %s" % model_path)
    
    model = load_model(model_path)
    
    # Load scaler
    scaler_path = os.path.join(workspace, "packed_features", "train", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))
    
    # Load test data. 
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature. 
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, na] = data
        n_pad = (n_concat - 1) // 2
        
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
        
            # Process data. 
            #n_pad = (n_concat - 1) / 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)
            
        elif TF == "timedomain":
            #n_pad = (n_concat - 1) / 2
            mixed_x = pp_data.pad_with_border(mixed_cmplx_x, n_pad)
            
        elif TF == "fftmagnitude":
            #n_pad = (n_concat - 1) / 2
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")
            
        # Scale data. 
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)
        
        # Cut input spectrogram to 3D segments with n_concat. 
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        #print("loading data time: %s s" % (time.time() - t1,))
        '''
        # Debug: visualize intermediate feature maps.
        # model.layers[0] is fixed and denotes the input; change
        # model.layers[2] to the index of the layer you want to inspect.
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]  # only the input changes
        # Show the feature maps after the first convolution; the output is
        # (1, 149, 149, 32): (n_samples, height, width, n_feature_maps).
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''
        # Predict. 
        t2 = time.time()
        pred = model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))
        #print(pred)
        
        # Inverse scale. 
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        
        #(frames, frame_length) = pred.shape
        #print("pred dimensions: %d x %d" % (frames, frame_length))
        # Debug plot. 
        if args.visualize:
            if TF == "spectrogram":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("Mixture log spectrogram")
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_spectra.png')
                plt.show()
            elif TF == "timedomain":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("Mixture time domain")
                axs[1].set_title("Clean speech time domain")
                axs[2].set_title("Enhanced speech time domain")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_time.png')
                plt.show()
            else:
                raise Exception("No debug plot defined for TF=%s!" % TF)
                    

        # Recover enhanced wav. 
        #pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10**(pred/20))-1e-10
            #s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            #s *= np.sqrt((np.hamming(n_window)**2).sum())  # Scale to compensate
            #    # for the amplitude change after the spectrogram and IFFT.
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
        elif TF == "timedomain":
            s = time_recover_wav(pred, n_window, n_hop, 'hamming')
            #s *= np.sqrt((np.hamming(n_window)**2).sum())
            
        elif TF == "fftmagnitude":
            #n_pad = (n_concat - 1) / 2
            s = spectra_to_wav(pred, mixed_cmplx_x, n_window, n_hop, 'hamming')
            
        else:
            raise Exception("TF must be spectrogram timedomain or fftmagnitude!")
            
        # Write out enhanced wav. 
        out_path = os.path.join(workspace, "enh_wavs", "test", "%s" % model_name, "%s.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))
        
    print("total test time: %s s" % (time.time() - t1,))
Ejemplo n.º 19
0
def jsc_separation(args):
    """Joing separation-classification (JSC) source separation. 
    """
    workspace = args.workspace

    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    scaler = pickle.load(open(scaler_path, 'rb'))

    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    for na in names:
        if ".mix" in na:
            # Read yaml
            bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
            with open(yaml_path, 'r') as f:
                data = yaml.safe_load(f)
            event_type = data['event_type']
            print(na, event_type)

            # Read audio
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Spectrogram
            [f, t, bg_spec] = signal.spectrogram(x=bg_audio,
                                                 window=ham_win,
                                                 nperseg=n_window,
                                                 noverlap=n_overlap,
                                                 detrend=False,
                                                 return_onesided=True,
                                                 scaling='density',
                                                 mode='complex')

            [f, t, event_spec] = signal.spectrogram(x=event_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')

            [f, t, mixed_spec] = signal.spectrogram(x=mixed_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Log Mel spectrogram
            mixed_x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)

            # Segmentation masks
            [mel_masks] = md.run_function(f_forward,
                                          x3d,
                                          batch_size=10,
                                          tr_phase=0.)
            mel_masks = mel_masks[0]  # (n_time, 64)
            spec_masks = np.dot(mel_masks, inverse_melW)  # (n_time, 513)

            if args.plot_only:
                mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
                bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
                event_mel_spec = np.dot(np.abs(event_spec), melW.T)
                ratio = 1.7  # about 5 dB (10**(5/20.) is roughly 1.78)
                event_mask = (np.sign(event_mel_spec /
                                      (bg_mel_spec * ratio) - 1) + 1) / 2

                fig, axs = plt.subplots(3, 2, sharex=True)
                axs[0, 0].matshow(np.log(mixed_mel_spec.T),
                                  origin='lower',
                                  aspect='auto')
                axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
                axs[1, 0].matshow(spec_masks[0].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[1, 1].matshow(spec_masks[1].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[2, 0].matshow(spec_masks[2].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[2, 1].matshow(spec_masks[3].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[0, 0].set_title('log Mel of mixture')
                axs[0, 1].set_title('IBM of event')
                axs[1, 0].set_title('babycry')
                axs[1, 1].set_title('glassbreak')
                axs[2, 0].set_title('gunshot')
                axs[2, 1].set_title('bg')

                plt.show()

            else:
                # Separated spec
                separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

                # Write out all events and bg
                enlarged_events = cfg.events + ['bg']
                for i1 in range(len(enlarged_events)):
                    s = spectrogram_to_wave.recover_wav(
                        separated_specs[i1],
                        mixed_spec,
                        n_overlap=n_overlap,
                        winfunc=np.hamming,
                        wav_len=len(mixed_audio))
                    s *= recover_scaler
                    pp_data.write_audio(
                        os.path.join(
                            out_dir, "%s.sep_%s.wav" %
                            (bare_name, enlarged_events[i1])), s, fs)

                # Write out event
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[cfg.lb_to_ix[event_type]],
                    mixed_spec,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_event.wav" % bare_name), s,
                    fs)

                # Write out origin mix
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                    mixed_audio, fs)
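
# get_inverse_W (used above) is not shown in this listing. One common way to
# map mel-domain masks back to the linear-frequency domain is to normalise
# the filterbank so each frequency column sums to one; this is a sketch of
# that choice, not necessarily the authors' implementation:
import numpy as np

def get_inverse_W_sketch(W, eps=1e-10):
    """W: (n_mels, n_freq) mel filterbank; returns (n_mels, n_freq)."""
    # np.dot(mel_masks, inverse_W) then spreads each mel value over the
    # frequency bins its triangle covers.
    return W / np.maximum(W.sum(axis=0, keepdims=True), eps)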
Ejemplo n.º 20
0
def DB_generate(source_audio, out_folder, name):

    #source_audio = pra.normalize(source_audio, bits=16)

    mic_distance = random.randint(
        1, 20)  # mean distance from source to microphones
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    # Rejection sampling: draw an array center until it lands inside the
    # room (this can take a long time for small rooms).
    mic_in_room = False
    while not mic_in_room:
        theta = random.uniform(0, 2 * math.pi)
        mic_center = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        print(mic_center)
        if (0 <= mic_center[0] <= room_dimensions[0]) and (
                0 <= mic_center[1] <= room_dimensions[1]):
            mic_in_room = True

    # number of lateral microphones
    M = 4
    # counterclockwise rotation of array:
    phi = 0
    # distance between microphones
    d = 0.4

    mic_pos = pra.beamforming.linear_2D_array(mic_center, M, phi, d)
    mic_pos = np.concatenate((mic_pos, np.array(mic_center, ndmin=2).T),
                             axis=1)

    distances = []
    for m in range(M):
        d = math.sqrt((source_position[0] - mic_pos[0, m])**2 +
                      (source_position[1] - mic_pos[1, m])**2)
        distances.append(d)

    # create room
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    # shoebox.mic_array.to_wav(os.path.join(out_folder + '_DB', 'mix_' + name), norm=True, bitdepth=np.int16)

    Lg_t = 0.100  # filter size in seconds
    Lg = int(np.ceil(Lg_t * fs))  # in samples
    fft_len = 512

    mics = pra.Beamformer(mic_pos, shoebox.fs, N=fft_len, Lg=Lg)

    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)
    shoebox.compute_rir()
    shoebox.simulate()

    # ADDING NOISE

    for n in range(M + 1):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)

        mixed_signal = add_noise(source_audio, signal)

        mixed_signal = np.array(mixed_signal, dtype=np.int16)

        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)
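
# add_noise (used above) is not shown in this listing. A minimal sketch under
# the assumption that it mixes a secondary signal into the simulated mic
# signal at some target SNR; the exact semantics and the snr_db default are
# assumptions:
import numpy as np

def add_noise_sketch(noise, signal, snr_db=10.0):
    noise = np.resize(noise, len(signal))  # repeat/trim to match length
    sig_pow = np.mean(signal ** 2)
    noise_pow = np.mean(noise ** 2) + 1e-12
    gain = np.sqrt(sig_pow / (noise_pow * 10 ** (snr_db / 10.0)))
    return signal + gain * noise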
Ejemplo n.º 21
0
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenta, should equal to n_concat 
          in the training stage. 
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    data_type = 'IRM'

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    if data_type == "DM":
        model_path = os.path.join(workspace, "models", "mixdb",
                                  "md_%diters.h5" % 120000)
    else:
        model_path = os.path.join(workspace, "models", "mask_mixdb",
                                  "md_%diters.h5" % 265000)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "mixdb", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "mixdb")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        if data_type == "IRM":
            mixed_x = speech_x + noise_x
            mixed_x1 = speech_x + noise_x
        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        if data_type == "IRM":
            pred_sp = pred * mixed_x1
        print(cnt, na)

        # Inverse scale.
        if data_type == "DM":
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
            pred_sp = np.exp(pred)
        # Debug plot.
        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Scale to compensate for the amplitude change after the
        # spectrogram and IFFT.
        s *= np.sqrt((np.hamming(n_window)**2).sum())
        # Write out enhanced wav.
        if data_type == "DM":
            out_path = os.path.join(workspace, "enh_wavs", "test", "mixdb",
                                    "%s.enh.wav" % na)
        else:
            out_path = os.path.join(workspace, "enh_wavs", "test",
                                    "mask_mixdb", "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
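
# The IRM branch above multiplies the predicted mask by the mixture
# magnitude. For reference, a sketch of how an ideal-ratio-mask training
# target is commonly built from parallel speech and noise magnitudes (this
# is the textbook definition, not code taken from this repository):
import numpy as np

def ideal_ratio_mask(speech_mag, noise_mag, eps=1e-10):
    return speech_mag / np.maximum(speech_mag + noise_mag, eps)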
Ejemplo n.º 22
0
def predict_folder(input_file_folder: str, output_file_folder: str):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    # if scale:
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    # names = os.listdir(input_file_folder)

    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]

    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')


        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)
        # speech_x = dnn1_train.log_sp(speech_x)

        # Scale data.
        # if scale:
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)


        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        #if scale:
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))


        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        # Scale to compensate for the amplitude change after the
        # spectrogram and IFFT.
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

        # Write out enhanced wav.

        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
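
# real_to_complex (used above) is not shown in this listing. A minimal
# sketch under the assumption that it attaches the mixture phase to the
# predicted log-magnitude so the result can be inverted later:
import numpy as np

def real_to_complex_sketch(pred_log_mag, mixed_complex):
    return np.exp(pred_log_mag) * np.exp(1j * np.angle(mixed_complex))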
Ejemplo n.º 23
0
def prepare_database():

    (noise, _) = pp.read_audio(conf1.noise_path)

    with open('dnn1/dnn1_files_list.txt') as f:
        dnn1_data = f.readlines()

    # generate train spectrograms
    mixed_all = []
    clean_all = []

    snr1_list = []
    mixed_avg = []

    for n in range(conf1.training_number):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)

        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)

        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))

        if n % 10 == 0:
            print(n)

        if conf1.save_single_files and n < conf1.n_files_to_save:

            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))

            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))
            path_list = current_file.split(os.sep)

            mixed_path = os.path.join(conf1.train_folder, mixed_name)
            clean_path = os.path.join(conf1.train_folder, clean_name)

            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)

        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')

        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))
    num_tr = pp.pack_features(mixed_all, clean_all, 'train')

    compute_scaler('train')

    # generate test spectrograms
    mixed_all = []
    clean_all = []

    snr1_list = []
    mixed_avg = []

    for n in range(conf1.test_number):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)

        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)

        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))

        if n % 10 == 0:
            print(n)

        if conf1.save_single_files and n < conf1.n_files_to_save:

            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))

            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))

            mixed_path = os.path.join(conf1.test_folder, mixed_name)
            clean_path = os.path.join(conf1.test_folder, clean_name)

            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)

        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')

        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))

    num_te = pp.pack_features(mixed_all, clean_all, 'test')

    compute_scaler('test')

    return num_tr, num_te
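
# compute_scaler (used above) is not shown in this listing. A minimal sketch
# under the assumption that it fits a per-frequency mean/variance scaler on
# the packed log-magnitude features and pickles it for later use; the
# scikit-learn dependency and the output path are assumptions:
import pickle
from sklearn.preprocessing import StandardScaler

def compute_scaler_sketch(x2d, out_path):
    """x2d: (n_frames, n_freq) log-magnitude features."""
    scaler = StandardScaler().fit(x2d)
    with open(out_path, 'wb') as f:
        pickle.dump(scaler, f)
    return scaler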
Ejemplo n.º 24
0
def decode():
    """Decoding the inputs using current model."""
    tf.logging.info("Get TEST sets number.")
    num_batch = get_num_batch(FLAGS.test_list_file, infer=True)
    with tf.Graph().as_default():
        with tf.device('/cpu:0'):
            with tf.name_scope('input'):
                data_list = read_list(FLAGS.test_list_file)
                test_utt_id, test_inputs, _ = get_batch(
                    data_list,
                    batch_size=1,
                    input_size=FLAGS.input_dim,
                    output_size=FLAGS.output_dim,
                    left=FLAGS.left_context,
                    right=FLAGS.right_context,
                    num_enqueuing_threads=FLAGS.num_threads,
                    num_epochs=1,
                    infer=True)
                # test_inputs = tf.squeeze(test_inputs, axis=[0])
        devices = []
        for i in range(FLAGS.num_gpu):
            device_name = ("/gpu:%d" % i)
            print('Using device: ', device_name)
            devices.append(device_name)

        # Prevent exhausting all the gpu memories.
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4
        #config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        set_session(tf.Session(config=config))
        # execute the session
        with tf.Session(config=config) as sess:
            # Create two models with tr_inputs and cv_inputs individually.
            with tf.name_scope('model'):
                model = DNNTrainer(sess,
                                   FLAGS,
                                   devices,
                                   test_inputs,
                                   labels=None,
                                   cross_validation=True)

            show_all_variables()

            init = tf.group(tf.global_variables_initializer(),
                            tf.local_variables_initializer())
            print("Initializing variables ...")
            sess.run(init)

            if model.load(model.save_dir, moving_average=False):
                print("[*] Load Moving Average model SUCCESS")
            else:
                print("[!] Load failed. Checkpoint not found. Exit now.")
                sys.exit(1)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            cmvn_filename = os.path.join(FLAGS.data_dir, "train_cmvn.npz")
            if os.path.isfile(cmvn_filename):
                cmvn = np.load(cmvn_filename)
            else:
                tf.logging.fatal("%s does not exist, exiting now." % cmvn_filename)
                sys.exit(1)
            out_dir_name = os.path.join('/Work18/2017/linan/SE/my_enh',
                                        FLAGS.save_dir, FLAGS.savetestdir)
            # out_dir_name = os.path.join(FLAGS.save_dir, 'test')
            if not os.path.exists(out_dir_name):
                os.makedirs(out_dir_name)

            write_scp_path = os.path.join(out_dir_name, 'feats.scp')
            write_ark_path = os.path.join(out_dir_name, 'feats.ark')
            writer = ArkWriter(write_scp_path)

            outputs = model.generator(test_inputs, None, reuse=True)
            outputs = tf.reshape(outputs, [-1, model.output_dim])
            print('shape is', np.shape(outputs))
            try:
                for batch in range(num_batch):
                    if coord.should_stop():
                        break
                    # outputs = model.generator(test_inputs, None, reuse=True)
                    # outputs = tf.reshape(outputs, [-1, model.output_dim])
                    utt_id, activations = sess.run([test_utt_id, outputs])
                    # sequence = activations * cmvn['stddev_labels'] + \
                    # cmvn['mean_labels']
                    sequence = activations
                    save_result = np.vstack(sequence)
                    dir_load = FLAGS.savetestdir
                    dir_load = dir_load.split('/')[-1]
                    mode = FLAGS.mode
                    if mode == 'use_org':
                        inputs_path = os.path.join(
                            'workspace/features/spectrogram/test', dir_load,
                            '%s.wav.p' % utt_id[0])
                        data = cPickle.load(open(inputs_path, 'rb'))
                        [mixed_complx_x] = data
                        #tf.logging.info("Write inferred %s to %s" %(utt_id[0], np.shape(save_result)))
                        save_result = np.exp(save_result)
                        n_window = cfg.n_window
                        s = recover_wav(save_result, mixed_complx_x,
                                        cfg.n_overlap, np.hamming)
                        # Scale to compensate for the amplitude change
                        # after the spectrogram and IFFT.
                        s *= np.sqrt((np.hamming(n_window)**2).sum())
                        print("Recovered enhanced waveform")
                        # Write out enhanced wav.
                        out_path = os.path.join("workspace", "enh_wavs",
                                                "test", dir_load,
                                                "%s.enh.wav" % utt_id[0])
                        print("have enhanced all  the wav")
                        pp_data.create_folder(os.path.dirname(out_path))
                        pp_data.write_audio(out_path, s, 16000)
                    elif mode == 'g_l':
                        inputs_path = os.path.join(
                            'workspace/features/spectrogram/test', dir_load,
                            '%s.wav.p' % utt_id[0])
                        data = cPickle.load(open(inputs_path, 'rb'))
                        [mixed_complx_x] = data
                        save_result = np.exp(save_result)
                        s = save_result
                        s = audio_utilities.reconstruct_signal_griffin_lim(
                            s, mixed_complx_x, 512, 256, 15)
                        #s = recover_wav(save_result,mixed_complx_x,cfg.n_overlap, np.hamming)
                        s *= np.sqrt((np.hamming(cfg.n_window)**2).sum())
                        #s = audio._griffin_lim(s)
                        out_path = os.path.join("workspace", "enh_wavs",
                                                "test2", dir_load,
                                                "%s.enh.wav" % utt_id[0])
                        pp_data.create_folder(os.path.dirname(out_path))
                        pp_data.write_audio(out_path, s, 16000)
                        tf.logging.info("Write inferred%s" % (np.shape(s)))
                    #writer.write_next_utt(write_ark_path, utt_id[0], save_result)
                    tf.logging.info("Write inferred %s to %s" %
                                    (utt_id[0], out_path))

            except Exception as e:
                # Report exceptions to the coordinator.
                coord.request_stop(e)
            finally:
                # Ask the queue-runner threads to stop and wait for them.
                coord.request_stop()
                coord.join(threads)
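
# audio_utilities.reconstruct_signal_griffin_lim (used in the 'g_l' branch
# above) comes from an external module. A comparable reconstruction with
# librosa's built-in Griffin-Lim; the parameter mapping (n_fft=512, hop=256,
# 15 iterations, ignoring the mixture STFT argument) is an assumption:
import librosa

def griffin_lim_sketch(mag, n_fft=512, hop=256, n_iter=15):
    """mag: (n_frames, n_freq) magnitude spectrogram."""
    return librosa.griffinlim(mag.T, n_iter=n_iter, hop_length=hop,
                              win_length=n_fft)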
Ejemplo n.º 25
0
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat 
          in the training stage. 
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    n_noise_frame = args.noise_frame
    n_hop = args.n_hop

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = False
    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler.
    # scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p")
    # scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    mel_basis = librosa.filters.mel(cfg.sample_rate, cfg.n_window, n_mels=40)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        input1_3d, input2, out1, out2 = pp_data.get_input_output_layer(
            mixed_cmplx_x, speech_x, noise_x, alpha, n_concat, n_noise_frame,
            n_hop, mel_basis)

        # Predict.
        pred = model.predict([input1_3d, input2])
        print(cnt, na)
        sys.stdout.flush()

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # post processing
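        # Average three log-power-spectrum estimates: 257-bin slices of the
        # two network outputs, the mixture LPS, and a predicted log-gain
        # taken from bins 327:584 of the second output (layout inferred from
        # the slicing below; the exact output structure is not shown here).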
        pred_speech_lps = 1 / 3.0 * (pred[0][:, :257] + pred[1][:, :257] +
                                     np.log(np.abs(mixed_cmplx_x) + 1e-08) +
                                     np.log(pred[1][:, 327:584]))

        # Debug plot.
        if args.visualize:
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.all.png" % na)
            pp_data.create_folder(os.path.dirname(out_path))
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08),
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(np.log(speech_x.T + 1e-08),
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred_speech_lps.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.savefig(out_path)
            plt.close('all')
            # plt.show()
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr),
                                    "%s.mixture.png" % na)
            display.specshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08))
            plt.title("%ddb mixture log spectrogram" % int(te_snr))
            plt.savefig(out_path)
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.clean.png" % na)
            display.specshow(np.log(speech_x.T + 1e-08))
            plt.title("Clean speech log spectrogram")
            plt.savefig(out_path)
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.enh.png" % na)
            display.specshow(pred_speech_lps.T)
            plt.title("Enhanced speech log spectrogram")
            plt.savefig(out_path)
            plt.close('all')

        # Recover enhanced wav.
        pred_sp = np.exp(pred_speech_lps)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Scale to compensate for the amplitude change after the
        # spectrogram and IFFT.
        s *= np.sqrt((np.hamming(n_window)**2).sum())

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
Ejemplo n.º 26
0
def inference(workspace,
              tr_snr,
              te_snr,
              n_concat,
              iteration,
              model_name=None,
              visualize=False,
              force=False):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    if model_name is None:
        model_name = '_'.join([str(snr) for snr in tr_snr]) + 'ddbs'

    # Load model.
    model_path = os.path.join(workspace, "models", model_name,
                              "md_%diters.h5" % iteration)
    print('GPU available: ', tf.test.is_gpu_available())

    model = load_model(model_path)

    # Load scaler.
    scaler = read_combined_scaler(workspace, tr_snr)

    for snr in te_snr:
        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                                "%ddb" % int(snr))
        feat_paths = all_file_paths(feat_dir)

        for (cnt, feat_path) in tqdm(enumerate(feat_paths),
                                     'Inference (creating enhanced speech)'):
            # Check if the enhanced audio is already inferred
            na = str(PurePath(feat_path).relative_to(feat_dir).with_suffix(''))
            out_path = os.path.join(workspace, "enh_wavs", "test", model_name,
                                    "%ddb" % int(snr), "%s.enh.wav" % na)
            if os.path.isfile(out_path) and not force:
                print(f'Enhanced audio {out_path} is already made')
                continue

            # Load feature.
            data = pickle.load(open(feat_path, 'rb'))
            [mixed_cmplx_x, speech_x, noise_x, ir_mask, alpha, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data.
            n_pad = (n_concat - 1) // 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

            # Predict.
            pred = model.predict(mixed_x_3d)
            #print(cnt, na)

            # Inverse scale.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
                #pred = pp_data.inverse_scale_on_2d(pred, scaler)

            # Debug plot.
            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(speech_x.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(pred.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[0].set_title("%ddb mixture log spectrogram" % int(snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav
            s = recover_wav(pred,
                            mixed_cmplx_x,
                            n_overlap,
                            np.hamming,
                            irr_mask=True)
            # Scale to compensate for the amplitude change after the
            # spectrogram and IFFT.
            s *= np.sqrt((np.hamming(n_window)**2).sum())

            # Write out enhanced wav.
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)
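
# recover_wav(..., irr_mask=True) above is not shown in this listing.
# A sketch of the masking path under the assumption that the network output
# is a ratio mask applied to the mixture magnitude before a phase-aware
# ISTFT; names, the librosa call and the window handling are assumptions:
import numpy as np
import librosa

def recover_wav_from_mask_sketch(mask, mixed_cmplx_x, n_overlap,
                                 winfunc=np.hamming):
    """mask, mixed_cmplx_x: (n_frames, n_freq) arrays."""
    n_window = (mixed_cmplx_x.shape[1] - 1) * 2   # n_freq = n_window/2 + 1
    hop = n_window - n_overlap
    enhanced = mask * np.abs(mixed_cmplx_x) * np.exp(1j * np.angle(mixed_cmplx_x))
    return librosa.istft(enhanced.T, hop_length=hop, win_length=n_window,
                         window=winfunc(n_window))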