Code example #1
def predict_file(file_path, model, scaler):

    (a, _) = pp.read_audio(file_path)
    mixed_complex = pp.calc_sp(a, 'complex')

    mixed_x = np.abs(mixed_complex)

    # Process data.
    n_pad = (conf1.n_concat - 1) // 2
    mixed_x = pp.pad_with_border(mixed_x, n_pad)
    mixed_x = pp.log_sp(mixed_x)
    # speech_x = dnn1_train.log_sp(speech_x)


    # Scale data.
    # if scale:
    mixed_x = pp.scale_on_2d(mixed_x, scaler)
    # speech_x = pp.scale_on_2d(speech_x, scaler)

    # Cut input spectrogram to 3D segments with n_concat.
    mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

    # Predict.
    pred = model.predict(mixed_x_3d)

    if visualize_plot:
        visualize(mixed_x, pred)
    # Inverse scale.
    # if scale:
    mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
    # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
    pred = pp.inverse_scale_on_2d(pred, scaler)


    # Debug plot.

    # Recover enhanced wav.
    pred_sp = np.exp(pred)
    s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # Scale factor to compensate for the
    # amplitude change after spectrogram and IFFT.

    # Write out enhanced wav.

    # audio_path = os.path.dirname(file_path)
    # pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_complex, pred, s
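
A note on the amplitude compensation used above and in several of the examples below: the factor is simply the square root of the analysis window's energy. A minimal standalone sketch, assuming an illustrative window length of 512 samples (the examples read the real value from their config, e.g. conf1.n_window):

import numpy as np

n_window = 512  # illustrative value; the examples take this from their config
window = np.hamming(n_window)
compensation = np.sqrt((window ** 2).sum())  # square root of the window energy
print(compensation)  # about 14.3 for a 512-point Hamming window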
Code example #2
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()

        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet',
                               vmin=vmin,
                               vmax=vmax)
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet',
                               vmin=vmin,
                               vmax=vmax)
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet',
                               vmin=vmin,
                               vmax=vmax)
                plt.show()
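
The Wiener-style masking step above turns the two model outputs into a magnitude-domain mask. A minimal numpy sketch of just that step on dummy arrays (the shapes are illustrative, and the epsilon is added here purely to avoid division by zero; it is not in the code above):

import numpy as np

n_frames, n_freq = 100, 257                       # illustrative shapes
pred_speech = np.random.rand(n_frames, n_freq)    # stands in for pred_dict['speech']
pred_noise = np.random.rand(n_frames, n_freq)     # stands in for pred_dict['noise']
mixture_mag = np.random.rand(n_frames, n_freq)    # stands in for np.abs(cmplx_sp)

eps = 1e-10
mask = pred_speech / (pred_speech + pred_noise + eps)  # values in [0, 1]
enhanced_mag = mask * mixture_mag
print(enhanced_mag.shape)                         # (100, 257)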
Code example #3
def inference(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)
    audio_type = 'speech'
    
    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)
    
    speech_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/timit_wavs/subtest"
    
    # Load model
    model_path = os.path.join(workspace, "models", filename, audio_type, "md_%d_iters.tar" % iter)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])
    
    if cuda:
        model.cuda()
        
    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)
    
    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1
        
    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)
    
    dft = pp_data.DFT(fft_size, cuda)
        
    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio0, _) = pp_data.read_audio(audio_path, sample_rate)
            
            audio = pp_data.normalize(audio0)
            
            # Enframe
            frames = stft.enframe(audio, fft_size, hop_size)
            
            # Process data. 
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(frames, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)
            
            pred_frames = forward(model, x, mean_, std_, cuda)
            
            pred_frames = pred_frames.data.cpu().numpy()
            
            # cola_constant = 0.5
            # seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            
            pred_frames *= window
            
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            seq = seq[0 : len(audio)]
            
            
            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)
            
            if visualize:
                
                clean_audio_path = os.path.join(speech_dir, name.split('.')[0] + ".WAV")
                (clean_audio, _) = pp_data.read_audio(clean_audio_path, sample_rate)
                clean_audio = pp_data.normalize(clean_audio)
                clean_frames = stft.enframe(clean_audio, fft_size, hop_size)
                
                mix_sp = np.abs(np.fft.rfft(frames * window, norm='ortho'))
                enh_sp = np.abs(np.fft.rfft(pred_frames * window, norm='ortho'))
                clean_sp = np.abs(np.fft.rfft(clean_frames * window, norm='ortho'))
                
                K = 10
                fig, axs = plt.subplots(K // 2, 2, sharex=True)
                for k in range(K):
                    axs[k // 2, k % 2].plot(frames[k+100], color='y')
                    axs[k // 2, k % 2].plot(clean_frames[k+100], color='r')
                    axs[k // 2, k % 2].plot(pred_frames[k+100], color='b')
                plt.show()
                
                
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3,1, sharex=True)
                axs[0].matshow(np.log(np.abs(mix_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[1].matshow(np.log(np.abs(clean_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[2].matshow(np.log(np.abs(enh_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                plt.show()
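
The reconstruction above relies on the project's stft.overlap_add and stft.get_cola_constant helpers. A generic sketch of overlap-add synthesis under the usual constant-overlap-add assumption (this is not the repository's implementation, and 1.08 is only the approximate overlapping-window sum for a Hamming window at 50% overlap):

import numpy as np

def overlap_add(frames, hop, cola_constant):
    # Sum windowed frames at multiples of the hop size, then undo the window gain.
    n_frames, frame_len = frames.shape
    out = np.zeros((n_frames - 1) * hop + frame_len)
    for i, frame in enumerate(frames):
        out[i * hop:i * hop + frame_len] += frame
    return out / cola_constant

frames = np.random.rand(10, 320) * np.hamming(320)   # illustrative windowed frames
signal = overlap_add(frames, hop=160, cola_constant=1.08)
print(signal.shape)                                   # (1760,)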
Code example #4
def inference(args):
    cuda = args.use_cuda and torch.cuda.is_available()
    workspace = args.workspace
    model_name = args.model_name
    feat_type = args.feat_type
    script_na = args.script_na

    # Load data.
    te_packed_feat_path = os.path.join(workspace, "packed_features", feat_type,
                                       "test.p")
    [te_x_list, te_y_list,
     te_na_list] = cPickle.load(open(te_packed_feat_path, 'rb'))

    # Scale.
    if True:
        scale_path = os.path.join(workspace, "scalers", feat_type, "scaler.p")
        scaler = pickle.load(open(scale_path, 'rb'))
        te_x_list = pp_data.scale_on_x_list(te_x_list, scaler)

    # Construct model topology.
    n_concat = 3
    te_n_hop = 1
    n_freq = te_x_list[0].shape[-1]
    n_out = te_y_list[0].shape[-1]
    model = Net(n_concat, n_freq, n_out)

    # Init the weights of model using trained weights.
    model_path = os.path.join(workspace, "models", script_na, feat_type,
                              model_name)
    if os.path.isfile(model_path):
        print("Loading checkpoint '%s'" % model_path)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])
    else:
        raise Exception("Model path %s does not exist!" % model_path)

    # Move model to GPU.
    if cuda:
        model.cuda()

    # Directory to write out transcript midi files.
    out_midi_dir = os.path.join(workspace, "out_midis",
                                pp_data.get_filename(__file__), feat_type)
    pp_data.create_folder(out_midi_dir)

    # Data to 3d.
    n_half = (n_concat - 1) // 2
    for i1 in xrange(len(te_x_list)):
        x = te_x_list[i1]  # (n_time, n_freq)
        y = te_y_list[i1]  # (n_time, n_out)
        bare_na = os.path.splitext(te_na_list[i1])[0]
        (n_time, n_freq) = x.shape

        zero_pad = np.zeros((n_half, n_freq))
        x = np.concatenate((zero_pad, x, zero_pad), axis=0)
        x3d = pp_data.mat_2d_to_3d(x, n_concat,
                                   te_n_hop)  # (n_time, n_concat, n_freq)

        # Move data to GPU.
        x3d = torch.Tensor(x3d)
        x3d = Variable(x3d)
        if cuda:
            x3d = x3d.cuda()

        # Inference.
        model.eval()
        pred = model(x3d)  # (n_time, n_out)

        # Convert data type to numpy.
        pred = pred.data.cpu().numpy()

        # Threshold and write out predicted piano roll to midi file.
        mid_roll = pp_data.prob_to_midi_roll(pred, 0.5)
        out_path = os.path.join(out_midi_dir, "%s.mid" % bare_na)
        print("Write out to: %s" % out_path)
        pp_data.write_midi_roll_to_midi(mid_roll, out_path)

        # Debug plot.
        if True:
            fig, axs = plt.subplots(3, 1, sharex=True)
            axs[0].matshow(y.T, origin='lower', aspect='auto')
            axs[1].matshow(pred.T, origin='lower', aspect='auto')
            binary_pred = (np.sign(pred - 0.5) + 1) / 2
            axs[2].matshow(binary_pred.T, origin='lower', aspect='auto')
            axs[0].set_title("Ground truth")
            axs[1].set_title("DNN output probability")
            axs[2].set_title("DNN output probability after thresholding")
            for j1 in xrange(3):
                axs[j1].set_ylabel('note index')
                axs[j1].set_xlabel('frames')
                axs[j1].xaxis.set_label_coords(1.06, -0.01)
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()
Code example #5
def inference(args):
    workspace = "workspace"
    n_concat = 11
    iter = 50000
    n_window = 320
    n_overlap = 160
    fs = 16000
    # Load model.
    model_path = os.path.join(workspace, "models", "crn_mixdb",
                              "md_%diters.h5" % iter)
    model = load_model(model_path, custom_objects={'keras': keras})
    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "crn_mixdb")
    #feat_dir = os.path.join(workspace, "features", "spectrogram", "train", "office_mixdb")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        # Process data.
        n_pad = (n_concat - 1)
        #mixed_x = pad_with_border(mixed_x, n_pad)
        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat,
                                          hop=11)  #[100, 7, 257]
        #mixed_x = pad_with_border(mixed_x, n_pad)
        #mixed_x_3d = mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # Predict.
        w, h, l = mixed_x_3d.shape
        pred = model.predict(mixed_x_3d)
        pred_sp = np.reshape(pred, [w * h, l])
        mixed_cmplx_x = mixed_cmplx_x[:w * h, :]
        #pred_sp = pred[:, -1, :]
        print(cnt, na)
        if False:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(speech_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred_sp.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(1))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()
            # Recover enhanced wav.
        #pred_sp = np.exp(pred)
        #pred_sp = pred
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2
                      ).sum())  # Scale factor to compensate for the amplitude change after spectrogram and IFFT.
        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "crn_mixdb",
                                "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
Code example #6
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "%ddb" % int(tr_snr), "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(speech_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2
                      ).sum())  # Scale factor to compensate for the amplitude
        # change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
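
All of these examples frame the spectrogram with pp_data.pad_with_border and pp_data.mat_2d_to_3d before feeding it to the network. A generic numpy sketch of that kind of segmentation, assuming n_concat consecutive frames are taken with the given hop (the repository helpers may treat borders and remainders differently):

import numpy as np

def frames_to_segments(x, agg_num, hop):
    # Cut a (n_time, n_freq) array into overlapping (agg_num, n_freq) segments.
    segments = [x[i:i + agg_num] for i in range(0, len(x) - agg_num + 1, hop)]
    return np.array(segments)

spec = np.random.rand(20, 257)                                # illustrative log spectrogram
n_concat = 7
n_pad = (n_concat - 1) // 2
padded = np.pad(spec, ((n_pad, n_pad), (0, 0)), mode='edge')  # border padding
print(frames_to_segments(padded, n_concat, hop=1).shape)      # (20, 7, 257)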
Code example #7
File: main_dnn.py  Project: flyingleafe/sednn
def inference(workspace,
              tr_snr,
              te_snr,
              n_concat,
              iteration,
              model_name=None,
              visualize=False,
              force=False):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    if model_name is None:
        model_name = '_'.join([str(snr) for snr in tr_snr]) + 'ddbs'

    # Load model.
    model_path = os.path.join(workspace, "models", model_name,
                              "md_%diters.h5" % iteration)
    print('GPU available: ', tf.test.is_gpu_available())

    model = load_model(model_path)

    # Load scaler.
    scaler = read_combined_scaler(workspace, tr_snr)

    for snr in te_snr:
        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                                "%ddb" % int(snr))
        feat_paths = all_file_paths(feat_dir)

        for (cnt, feat_path) in tqdm(enumerate(feat_paths),
                                     'Inference (creating enhanced speech)'):
            # Check if the enhanced audio is already inferred
            na = str(PurePath(feat_path).relative_to(feat_dir).with_suffix(''))
            out_path = os.path.join(workspace, "enh_wavs", "test", model_name,
                                    "%ddb" % int(snr), "%s.enh.wav" % na)
            if os.path.isfile(out_path) and not force:
                print(f'Enhanced audio {out_path} is already made')
                continue

            # Load feature.
            data = pickle.load(open(feat_path, 'rb'))
            [mixed_cmplx_x, speech_x, noise_x, ir_mask, alpha, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data.
            n_pad = (n_concat - 1) // 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

            # Predict.
            pred = model.predict(mixed_x_3d)
            #print(cnt, na)

            # Inverse scale.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
                #pred = pp_data.inverse_scale_on_2d(pred, scaler)

            # Debug plot.
            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(speech_x.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(pred.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav
            s = recover_wav(pred,
                            mixed_cmplx_x,
                            n_overlap,
                            np.hamming,
                            irr_mask=True)
            s *= np.sqrt((np.hamming(n_window)**2
                          ).sum())  # Scale factor to compensate for the amplitude
            # change after spectrogram and IFFT.

            # Write out enhanced wav.
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)
Code example #8
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    data_type = 'IRM'

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    if data_type == "DM":
        model_path = os.path.join(workspace, "models", "mixdb",
                                  "md_%diters.h5" % 120000)
    else:
        model_path = os.path.join(workspace, "models", "mask_mixdb",
                                  "md_%diters.h5" % 265000)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "mixdb", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "mixdb")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        if data_type == "IRM":
            mixed_x = speech_x + noise_x
            mixed_x1 = speech_x + noise_x
        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        if data_type == "IRM":
            pred_sp = pred * mixed_x1
        print(cnt, na)

        # Inverse scale.
        if data_type == "DM":
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
            pred_sp = np.exp(pred)
        # Debug plot.
        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2
                      ).sum())  # Scale factor to compensate for the amplitude
        # change after spectrogram and IFFT.
        # Write out enhanced wav.
        if data_type == "DM":
            out_path = os.path.join(workspace, "enh_wavs", "test", "mixdb",
                                    "%s.enh.wav" % na)
        else:
            out_path = os.path.join(workspace, "enh_wavs", "test",
                                    "mask_mixdb", "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
Code example #9
File: main_dnn.py  Project: zk1001/ClearWave
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    calc_log = args.calc_log
    model_file = args.model_file

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Build model
    n_concat = 7
    n_freq = 257
    n_hid = 2048
    lr = 1e-3

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    if calc_log:
        model.add(Dense(n_freq, activation='linear'))
    else:
        model.add(Dense(n_freq, activation='relu'))
    model.summary()

    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Load model.
    if (model_file == "null"):
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                                  "md_%diters.h5" % iter)
        #model = load_model(model_path)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler.
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            #speech_x = pp_data.log_sp(speech_x)
        else:
            mixed_x = mixed_x
            #speech_x = speech_x

        # Scale data.
        if calc_log:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            #speech_x = pp_data.scale_on_2d(speech_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max

            speech_x_max = np.max(speech_x)
            print("max of speech_x:", speech_x_max)
            speech_x = speech_x / speech_x_max

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        if False:
            print(mixed_x_3d)
        pred = model.predict(mixed_x_3d)
        print(cnt, na)
        if False:
            print("pred")
            print(pred)
            print("speech")
            print(speech_x)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            #speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            #speech_x = speech_x * 16384
            pred = pred * mixed_x_max

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            #axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        if calc_log:
            pred_sp = np.exp(pred)
        else:
            #gv = 0.025
            #pred_sp = np.maximum(0,pred - gv)
            pred_sp = pred

        if False:
            pred_sp = mixed_x[3:-3]

        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2
                      ).sum())  # Scale factor to compensate for the amplitude
        # change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        # Write out enhanced pcm 8K pcm_s16le.
        out_pcm_path = os.path.join(workspace, "enh_wavs", "test",
                                    "%ddb" % int(te_snr), "%s.enh.pcm" % na)
        cmd = ' '.join([
            "./ffmpeg -y -i ", out_path,
            " -f s16le -ar 8000 -ac 1 -acodec pcm_s16le ", out_pcm_path
        ])
        os.system(cmd)

        # Write out webrtc-denoised enhanced pcm 8K pcm_s16le.
        ns_out_pcm_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr),
                                       "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr),
                                       "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path])
        os.system(cmd)
        cmd = ' '.join([
            "./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i ",
            ns_out_pcm_path, "  ", ns_out_wav_path
        ])
        os.system(cmd)

        cmd = ' '.join(["rm ", out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["rm ", ns_out_pcm_path])
        os.system(cmd)
Code example #10
File: tmp01.py  Project: zqy1/sednn
def inference(args):
    workspace = args.workspace
    model_name = args.model_name
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model
    model_path = os.path.join(workspace, "models", filename, model_name)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num
    else:
        n_every = 1

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            output = forward(model, x, mean_, std_, cuda)
            output = output.data.cpu().numpy()

            print(output.shape)
            if visualize:
                fig, axs = plt.subplots(2, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(np.log(np.abs(output)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                plt.show()

            # import crash  # intentional debug breakpoint in the original; commented out so the loop can run
            # pause
Code example #11
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    #tr_snr = args.tr_snr
    #te_snr = args.te_snr
    n_concat = args.n_concat
    #iter = args.iteration
    TF = args.TF
    model_name = args.model_name
    
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    #snr = cfg.SNR
    n_hop = int(n_window-n_overlap)
    fs = cfg.sample_rate
    scale = True
    
    # Load model
    t1 = time.time()
    #model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    mag_model_root = os.path.join(workspace, "saved_models", "%s" % model_name )
    #model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    mag_model_files = find_models(mag_model_root)
    epoch_num = []
    for i in range(len(mag_model_files)):
        epoch_num.append(int(mag_model_files[i].split("/")[-1].split('-')[2]))
    mag_model_index = epoch_num.index(max(epoch_num))
    mag_model_path = mag_model_files[mag_model_index]
    print("The selected model path is %s :" % mag_model_path)
    
    mag_model = load_model(mag_model_path)
    
    '''
    # loading phase model
    phase_model_root = os.path.join(workspace, "phase_saved_models", "%s" % model_name )
    #model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    phase_model_files = find_models(phase_model_root)
    epoch_num1 = []
    for i in range(len(phase_model_files)):
        epoch_num1.append(int(phase_model_files[i].split("/")[-1].split('-')[2]))
    phase_model_index = epoch_num1.index(max(epoch_num1))
    phase_model_path = phase_model_files[phase_model_index]
    print("The selected model path is %s :" % phase_model_path)
    
    phase_model = load_model(phase_model_path)
    '''
    # Load scaler
    mag_scaler_path = os.path.join(workspace, "packed_features", "train", "mag_scaler.p")
    mag_scaler = pickle.load(open(mag_scaler_path, 'rb'))
    
    #phase_scaler_path = os.path.join(workspace, "packed_features", "train", "phase_scaler.p")
    #phase_scaler = pickle.load(open(phase_scaler_path, 'rb'))
    
    # Load test data. 
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature. 
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x] = data
        n_pad = (n_concat - 1) // 2
        
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # mixed_phase = np.angle(mixed_cmplx_x)
            # Process data. 
            #n_pad = (n_concat - 1) / 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            # mixed_phase = pp_data.pad_with_border(mixed_phase, n_pad)
            
            # speech_x = pp_data.log_sp(np.abs(speech_cmplx_x))
            #speech_phase = np.angle(speech_cmplx_x)

            
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")
            
        # Scale data. 
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, mag_scaler)
            #mixed_phase = pp_data.scale_on_2d(mixed_phase, phase_scaler)
            #speech_phase = pp_data.scale_on_2d(speech_phase, phase_scaler)
        
        # Cut input spectrogram to 3D segments with n_concat. 
        #mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        #mixed_phase_3d = pp_data.mat_2d_to_3d(mixed_phase, agg_num=n_concat, hop=1)
        #print("loading data time: %s s" % (time.time() - t1,))
        '''
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])  # model.layers[0] is the input; change the second index to the layer whose output you want
        f1 = layer_1([mixed_x_3d])[0]  # only the input image is changed
        # Show the feature maps after the first convolutional layer; output shape is (1, 149, 149, 32): (n_samples, height, width, n_feature_maps)
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''
        # Predict. 
        t2 = time.time()
        mag_pred = mag_model.predict(mixed_x_3d)
        #phase_pred = phase_model.predict(mixed_phase_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))
        #print(pred)
        
        # Inverse scale. 
        if scale:
            # mixed_x = pp_data.inverse_scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, mag_scaler)
            mag_pred = pp_data.inverse_scale_on_2d(mag_pred, mag_scaler)
            
            #mixed_phase = pp_data.inverse_scale_on_2d(mixed_phase, phase_scaler)
            #speech_phase = pp_data.inverse_scale_on_2d(speech_phase, phase_scaler)
            #phase_pred = pp_data.inverse_scale_on_2d(phase_pred, phase_scaler)
        
       
                    

        # Recover enhanced wav. 
        #pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10**(mag_pred/10))-1e-10
            #pred_ph = np.exp(1j * phase_pred)
            '''
            R = np.multiply(pred_sp, pred_ph)
            result = librosa.istft(R.T,
                                   hop_length=n_hop,
                                   win_length=cfg.n_window,
                                   window=scipy.signal.hamming, center=False)
            result /= abs(result).max()
            y_out = result*0.8'''
            #s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            #s *= np.sqrt((np.hamming(n_window)**2).sum())   # Scaler for compensate the amplitude 
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
            
        # Write out enhanced wav. 
        out_path = os.path.join(workspace, "enh_flipphase", "test", "%s" % model_name, "{}_fft_dnn_map.wav".format(na.split('.')[0]))
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))
        
    print("total test time: %s s" % (time.time() - t1,))    
Code example #12
def inference1111(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate; should equal n_concat
          in the training stage.
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    #tr_snr = args.tr_snr
    #te_snr = args.te_snr
    n_concat = args.n_concat
    #iter = args.iteration
    TF = args.TF
    model_name = args.model_name
    
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    #snr = cfg.SNR
    n_hop = int(n_window-n_overlap)
    fs = cfg.sample_rate
    scale = True
    
    # Load model
    t1 = time.time()
    #model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    model_root = os.path.join(workspace, "saved_models", "%s" % model_name )
    #model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    model_files = find_models(model_root)
    epoch_num = []
    for i in range(len(model_files)):
        epoch_num.append(int(model_files[i].split("/")[-1].split('-')[2]))
    model_index = epoch_num.index(max(epoch_num))
    model_path = model_files[model_index]
    print("The selected model path is %s :" % model_path)
    
    model = load_model(model_path)
    
    # Load scaler
    scaler_path = os.path.join(workspace, "packed_features", "train", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))
    
    # Load test data. 
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature. 
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, na] = data
        n_pad = (n_concat - 1) // 2
        
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
        
            # Process data. 
            #n_pad = (n_concat - 1) / 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)
            
        elif TF == "timedomain":
            #n_pad = (n_concat - 1) / 2
            mixed_x = pp_data.pad_with_border(mixed_cmplx_x, n_pad)
            
        elif TF == "fftmagnitude":
            #n_pad = (n_concat - 1) / 2
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")
            
        # Scale data. 
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)
        
        # Cut input spectrogram to 3D segments with n_concat. 
        #mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        #print("loading data time: %s s" % (time.time() - t1,))
        '''
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])  # model.layers[0] is the input; change the second index to the layer whose output you want
        f1 = layer_1([mixed_x_3d])[0]  # only the input image is changed
        # Show the feature maps after the first convolutional layer; output shape is (1, 149, 149, 32): (n_samples, height, width, n_feature_maps)
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''
        # Predict. 
        t2 = time.time()
        pred = model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))
        #print(pred)
        
        # Inverse scale. 
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        
        #(frames, frame_length) = pred.shape
        #print("pred domensions %d and %d : " % (frames, frame_length))
        # Debug plot. 
        if args.visualize:
            if TF == "spectrogram":
                fig, axs = plt.subplots(3,1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in xrange(3):
                    axs[j1].xaxis.tick_bottom()
                    plt.tight_layout()
                    plt.savefig('debug_model_spectra.png')
                    plt.show()
            elif TF == "timedomain":
                fig, axs = plt.subplots(3,1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("%ddb mixture time domain" % int(te_snr))
                axs[1].set_title("Clean speech time domain")
                axs[2].set_title("Enhanced speech time domain")
                for j1 in xrange(3):
                    axs[j1].xaxis.tick_bottom()
                    plt.tight_layout()
                    plt.savefig('debug model_time.png')
                    plt.show()
            else:
                raise Exception("TF must be spectrogram or timedomain!")
                    

        # Recover enhanced wav. 
        #pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10**(pred/20))-1e-10
            #s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            #s *= np.sqrt((np.hamming(n_window)**2).sum())   # Scaler for compensate the amplitude 
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
                                                        # change after spectrogram and IFFT. 
        elif TF == "timedomain":
            s = time_recover_wav(pred, n_window, n_hop, 'hamming')
            #s *= np.sqrt((np.hamming(n_window)**2).sum())
            
        elif TF == "fftmagnitude":
            #n_pad = (n_concat - 1) / 2
            s = spectra_to_wav(pred, mixed_cmplx_x, n_window, n_hop, 'hamming')
            
        else:
            raise Exception("TF must be spectrogram timedomain or fftmagnitude!")
            
        # Write out enhanced wav. 
        out_path = os.path.join(workspace, "enh_wavs", "test", "%s" % model_name, "%s.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))
        
    print("total test time: %s s" % (time.time() - t1,))
Code example #13
def predict_folder(input_file_folder: str, output_file_folder: str):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    # if scale:
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    # names = os.listdir(input_file_folder)

    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]

    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')


        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)
        # speech_x = dnn1_train.log_sp(speech_x)

        # Scale data.
        # if scale:
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)


        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        #if scale:
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))


        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # Scale factor to compensate for the
        # amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.

        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
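
For reference, a hypothetical call to the function above; the folder paths are placeholders, not paths from any of these projects:

# Hypothetical paths: predict_folder enhances every "mix*" wav in the input folder
# and writes "enh_<name>" wavs to the output folder.
mixed_all, pred_all = predict_folder("data/test_mixes", "data/enhanced")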