Example #1
    def enhance(self, wav_dir):

        noisy_speech = utils.read_raw(wav_dir)
        temp_dir = './temp/temp.npy'
        np.save(temp_dir, noisy_speech)

        test_dr = dr.DataReader(temp_dir,
                                '',
                                self.norm_path,
                                dist_num=config.dist_num,
                                is_training=False,
                                is_shuffle=False)
        mean, std = test_dr.norm_process(self.norm_path + '/norm_noisy.mat')

        while True:
            test_inputs, test_labels, test_inphase, test_outphase = test_dr.whole_batch(
                test_dr.num_samples)
            if config.mode not in ('lstm', 'fcn'):
                feed_dict = {
                    self.node_inputs: test_inputs,
                    self.node_labels: test_labels,
                    self.node_keep_prob: 1.0
                }
            else:
                feed_dict = {
                    self.node_inputs: test_inputs,
                    self.node_labels: test_labels
                }

            pred = self.sess.run(self.node_prediction, feed_dict=feed_dict)

            if test_dr.file_change_checker():
                print(wav_dir)

                lpsd = np.expand_dims(np.reshape(pred, [-1, config.freq_size]),
                                      axis=2)

                lpsd = np.squeeze((lpsd * std * config.global_std) + mean)

                recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)),
                                               np.transpose(
                                                   test_inphase, (1, 0)),
                                               win_size=config.win_size,
                                               win_step=config.win_step,
                                               fs=config.fs)

                test_dr.reader_initialize()

                break

        file_dir = self.save_dir + '/' + os.path.basename(wav_dir).replace(
            'noisy', 'enhanced').replace('raw', 'wav')
        librosa.output.write_wav(file_dir,
                                 recon_speech,
                                 int(config.fs),
                                 norm=True)

        return recon_speech
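
The method above loads audio with utils.read_raw, which is not shown here. A minimal sketch of such a helper, assuming headerless 16-bit little-endian PCM; the project's actual reader may use a different sample format or scaling:

import numpy as np

def read_raw(path, dtype=np.int16):
    # Hypothetical stand-in for utils.read_raw: read a headerless PCM file
    # into a float32 array scaled to [-1, 1]. The 16-bit little-endian
    # assumption is a guess; adjust dtype/scaling to the data actually used.
    pcm = np.fromfile(path, dtype=dtype)
    return pcm.astype(np.float32) / np.iinfo(dtype).max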
Example #2
def speech_enhance(wav_dir, graph_name):

    noisy_speech = utils.read_raw(wav_dir)

    temp_dir = './temp/temp.npy'
    np.save(temp_dir, noisy_speech)
    graph = gt.load_graph(graph_name)
    norm_path = os.path.abspath('./data/train/norm')

    test_dr = dr.DataReader(temp_dir, '', norm_path, dist_num=config.dist_num, is_training=False, is_shuffle=False)

    node_inputs = graph.get_tensor_by_name('prefix/model_1/inputs:0')
    node_labels = graph.get_tensor_by_name('prefix/model_1/labels:0')
    node_keep_prob = graph.get_tensor_by_name('prefix/model_1/keep_prob:0')
    node_prediction = graph.get_tensor_by_name('prefix/model_1/pred:0')

    pred = []
    lab = []

    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess_config.gpu_options.allow_growth = True

    while True:

        test_inputs, test_labels = test_dr.next_batch(config.test_batch_size)

        feed_dict = {node_inputs: test_inputs, node_labels: test_labels, node_keep_prob: 1.0}

        with tf.Session(graph=graph, config=sess_config) as sess:
            pred_temp, lab_temp = sess.run([node_prediction, node_labels], feed_dict=feed_dict)

        pred.append(pred_temp)
        lab.append(lab_temp)

        # print(test_dr.file_change_checker())
        if test_dr.file_change_checker():
            print(wav_dir)
            phase = test_dr.phase[0]

            lpsd = np.expand_dims(np.reshape(np.concatenate(pred, axis=0), [-1, config.freq_size])[0:phase.shape[0], :], axis=2)

            mean, std = test_dr.norm_process(norm_path + '/norm_noisy.mat')

            lpsd = np.squeeze((lpsd * std) + mean)  # denorm

            recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)), np.transpose(phase, (1, 0)),
                                           win_size=config.win_size, win_step=config.win_step, fs=config.fs)

            # plt.plot(recon_speech)
            # plt.show()
            # lab = np.reshape(np.asarray(lab), [-1, 1])
            test_dr.reader_initialize()
            break

    return recon_speech
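
Both functions rebuild the waveform with utils.get_recon from the denormalized log-power spectrum and the noisy phase. A minimal sketch of that step, assuming lpsd holds the natural-log power spectrum with shape (freq_bins, frames) as in the transposed call above; the real utils.get_recon may differ in scaling and windowing:

import numpy as np
import librosa

def get_recon(lpsd, phase, win_size, win_step, fs):
    # Hypothetical stand-in for utils.get_recon: convert log-power back to
    # magnitude, reattach the noisy phase, and invert with an ISTFT.
    # fs is kept only to mirror the call signature used above.
    mag = np.exp(lpsd / 2.0)              # log(|S|^2) -> |S|
    spec = mag * np.exp(1j * phase)       # complex spectrogram with noisy phase
    return librosa.istft(spec, hop_length=win_step, win_length=win_size)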
Example #3
    def do_summary(self, m_summary, sess, itr):

        valid_path = self.valid_path
        clean_speech = self.clean_speech
        clean_speech = utils.identity_trans(clean_speech)

        noisy_speech = self.noisy_speech
        noisy_speech = utils.identity_trans(noisy_speech)

        temp_dir = self.temp_dir
        name = self.name
        logs_dir = self.logs_dir

        writer = SummaryWriter(log_dir=self.logs_dir + '/summary')

        summary_dr = dr.DataReader(temp_dir, '', valid_path["norm_path"], dist_num=config.dist_num, is_training=False,
                                   is_shuffle=False)
        pred = []

        while True:

            summary_inputs, summary_labels = summary_dr.next_batch(config.batch_size)

            feed_dict = {m_summary.inputs: summary_inputs, m_summary.labels: summary_labels, m_summary.keep_prob: 1.0}

            pred_temp = sess.run(m_summary.pred, feed_dict=feed_dict)

            pred.append(pred_temp)

            if summary_dr.file_change_checker():
                phase = summary_dr.phase[0]

                lpsd = np.expand_dims(
                    np.reshape(np.concatenate(pred, axis=0), [-1, config.freq_size])[0:phase.shape[0], :],
                    axis=2)

                mean, std = summary_dr.norm_process(valid_path["norm_path"] + '/norm_noisy.mat')

                lpsd = np.squeeze((lpsd * std * 1.18) + mean)  # denorm

                recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)), np.transpose(phase, (1, 0)),
                                               win_size=config.win_size, win_step=config.win_step, fs=config.fs)

                # plt.plot(recon_speech)
                # plt.show()
                # lab = np.reshape(np.asarray(lab), [-1, 1])
                summary_dr.reader_initialize()
                break

        # write summary

        if itr == config.summary_step:
            writer.close()
            self.noisy_measure = utils.se_eval(clean_speech,
                                               np.squeeze(noisy_speech),
                                               float(config.fs))
            summary_fname = tf.summary.text(name + '_filename', tf.convert_to_tensor(self.noisy_dir))

            if name == 'train':

                config_str = "<br>sampling frequency: %d</br>" \
                             "<br>window step: %d ms</br>" \
                             "<br>window size: %d ms</br>" \
                             "<br>fft size: %d</br>" \
                             "<br>learning rate: %f</br><br>learning rate decay: %.4f</br><br>learning" \
                             " rate decay frequency: %.4d</br>" \
                             "<br>dropout rate: %.4f</br><br>max epoch:" \
                             " %.4e</br><br>batch size: %d</br><br>model type: %s</br>"\
                             % (config.fs, (config.win_step/config.fs*1000), (config.win_size/config.fs*1000),
                                config.nfft, config.lr, config.lrDecayRate, config.lrDecayFreq, config.keep_prob,
                                config.max_epoch, config.batch_size, config.mode)

                summary_config = tf.summary.text(name + '_configuration', tf.convert_to_tensor(config_str))

                code_list = []
                read_flag = False

                with open('./lib/trnmodel.py', 'r') as f:
                    while True:
                        line = f.readline()
                        if "def inference(self, inputs):" in line:
                            read_flag = True

                        if "return fm" in line:
                            code_list.append('<br>' + line.replace('\n', '') + '</br>')
                            break

                        if read_flag:
                            code_list.append('<br>' + line.replace('\n', '') + '</br>')

                code_list = "<pre>" + "".join(code_list) + "</pre>"

                summary_model = tf.summary.text('train_model', tf.convert_to_tensor(code_list))

                summary_op = tf.summary.merge([summary_fname, summary_config, summary_model])
            else:
                summary_op = tf.summary.merge([summary_fname])

            with tf.Session() as sess:
                summary_writer = tf.summary.FileWriter(logs_dir + '/summary/text')
                text = sess.run(summary_op)
                summary_writer.add_summary(text, 1)
            summary_writer.close()

            writer = SummaryWriter(log_dir=logs_dir + '/summary')

            writer.add_audio(name + '_audio_ref' + '/clean',
                             clean_speech / np.max(np.abs(clean_speech)),
                             itr, sample_rate=config.fs)
            writer.add_audio(name + '_audio_ref' + '/noisy',
                             noisy_speech / np.max(np.abs(noisy_speech)),
                             itr, sample_rate=config.fs)
            clean_S = get_spectrogram(clean_speech)
            noisy_S = get_spectrogram(noisy_speech)

            writer.add_image(name + '_spectrogram_ref' + '/clean', clean_S, itr)  # image_shape = (C, H, W)
            writer.add_image(name + '_spectrogram_ref' + '/noisy', noisy_S, itr)  # image_shape = (C, H, W)

        enhanced_measure = utils.se_eval(clean_speech, recon_speech, float(config.fs))
        writer.add_scalars(name + '_speech_quality' + '/pesq', {'enhanced': enhanced_measure['pesq'],
                                                                'ref': self.noisy_measure['pesq']}, itr)
        writer.add_scalars(name + '_speech_quality' + '/stoi', {'enhanced': enhanced_measure['stoi'],
                                                                'ref': self.noisy_measure['stoi']}, itr)
        writer.add_scalars(name + '_speech_quality' + '/lsd', {'enhanced': enhanced_measure['lsd'],
                                                               'ref': self.noisy_measure['lsd']}, itr)
        writer.add_scalars(name + '_speech_quality' + '/ssnr', {'enhanced': enhanced_measure['ssnr'],
                                                                'ref': self.noisy_measure['ssnr']}, itr)

        writer.add_audio(name + '_audio_enhanced' + '/enhanced', recon_speech/np.max(np.abs(recon_speech)),
                         itr, sample_rate=config.fs)
        enhanced_S = get_spectrogram(recon_speech)
        writer.add_image(name + '_spectrogram_enhanced' + '/enhanced', enhanced_S, itr)  # image_shape = (C, H, W)
        writer.close()
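
The spectrogram images passed to add_image come from a get_spectrogram helper that is not shown. A minimal sketch of one way to build the (C, H, W) image it expects, assuming a dB-scaled magnitude spectrogram normalized to [0, 1]; the project's helper presumably uses config.nfft, config.win_size and config.win_step rather than the placeholder values here:

import numpy as np
import librosa

def get_spectrogram(speech, n_fft=512, win_length=512, hop_length=128):
    # Hypothetical stand-in: dB magnitude spectrogram scaled to [0, 1],
    # returned as (1, H, W) with low frequencies at the bottom of the image.
    S = librosa.amplitude_to_db(np.abs(librosa.stft(speech,
                                                    n_fft=n_fft,
                                                    win_length=win_length,
                                                    hop_length=hop_length)),
                                ref=np.max)
    S = (S - S.min()) / (S.max() - S.min() + 1e-8)
    return S[np.newaxis, ::-1, :]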
Example #4
def se_test(wav_dir, noise_dir, snr, noise_type=1):

    # clean_speech, clean_fs = librosa.load(wav_dir, config.fs)
    clean_speech = utils.read_raw(wav_dir)
    eng = matlab.engine.start_matlab()

    # noisy_speech = np.array(eng.noise_add(wav_dir, noise_dir, noise_type, snr, nargout=1))
    # noisy_speech, noisy_fs = librosa.load(noise_dir, config.fs)
    noisy_speech = utils.read_raw(noise_dir)

    # noisy_measure = se_eval(clean_speech, np.squeeze(noisy_speech), float(config.fs))

    temp_dir = './data/test/temp/temp.npy'

    np.save(temp_dir, noisy_speech)
    graph_name = sorted(glob.glob('./saved_model/*.pb'))[-1]
    graph = gt.load_graph(graph_name)
    norm_path = os.path.abspath('./data/train/norm')

    test_dr = dr.DataReader(temp_dir,
                            '',
                            norm_path,
                            dist_num=config.dist_num,
                            is_training=False,
                            is_shuffle=False)

    node_inputs = graph.get_tensor_by_name('prefix/model_1/inputs:0')
    node_labels = graph.get_tensor_by_name('prefix/model_1/labels:0')
    node_keep_prob = graph.get_tensor_by_name('prefix/model_1/keep_prob:0')
    node_prediction = graph.get_tensor_by_name('prefix/model_1/pred:0')

    pred = []
    lab = []

    while True:

        test_inputs, test_labels = test_dr.next_batch(config.test_batch_size)

        feed_dict = {
            node_inputs: test_inputs,
            node_labels: test_labels,
            node_keep_prob: 1.0
        }

        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        sess_config.gpu_options.allow_growth = True

        with tf.Session(graph=graph, config=sess_config) as sess:
            pred_temp, lab_temp = sess.run([node_prediction, node_labels],
                                           feed_dict=feed_dict)

        pred.append(pred_temp)
        lab.append(lab_temp)

        if test_dr.file_change_checker():
            phase = test_dr.phase[0]

            lpsd = np.expand_dims(np.reshape(np.concatenate(
                pred, axis=0), [-1, config.freq_size])[0:phase.shape[0], :],
                                  axis=2)

            mean, std = test_dr.norm_process(norm_path + '/norm_noisy.mat')

            lpsd = np.squeeze((lpsd * std) + mean)  # denorm

            recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)),
                                           np.transpose(phase, (1, 0)),
                                           win_size=config.win_size,
                                           win_step=config.win_step,
                                           fs=config.fs)

            # plt.plot(recon_speech)
            # plt.show()
            # lab = np.reshape(np.asarray(lab), [-1, 1])
            test_dr.reader_initialize()
            break

    # os.popen('rm -rf ' + temp_dir)

    noisy_measure = utils.se_eval(
        clean_speech[0:recon_speech.shape[0]],
        np.squeeze(noisy_speech[0:recon_speech.shape[0]]), float(config.fs))

    enhanced_measure = utils.se_eval(clean_speech[0:recon_speech.shape[0]],
                                     recon_speech, float(config.fs))
    print("pesq: %.4f -> %.4f" %
          (noisy_measure["pesq"], enhanced_measure["pesq"]))
    print("lsd: %.4f -> %.4f" %
          (noisy_measure["lsd"], enhanced_measure["lsd"]))
    print("stoi: %.4f -> %.4f" %
          (noisy_measure["stoi"], enhanced_measure["stoi"]))
    print("ssnr: %.4f -> %.4f" %
          (noisy_measure["ssnr"], enhanced_measure["ssnr"]))

    plt.subplot(3, 1, 1)
    S = librosa.amplitude_to_db(librosa.stft(
        clean_speech[0:recon_speech.shape[0]],
        hop_length=config.win_step,
        win_length=config.win_size,
        n_fft=config.nfft),
                                ref=np.max)
    ld.specshow(S, y_axis='linear', hop_length=config.win_step, sr=config.fs)

    plt.subplot(3, 1, 2)
    S = librosa.amplitude_to_db(librosa.stft(np.squeeze(
        noisy_speech[0:recon_speech.shape[0]]),
                                             hop_length=config.win_step,
                                             win_length=config.win_size,
                                             n_fft=config.nfft),
                                ref=np.max)
    ld.specshow(S, y_axis='linear', hop_length=config.win_step, sr=config.fs)

    plt.subplot(3, 1, 3)
    S = librosa.amplitude_to_db(librosa.stft(recon_speech,
                                             hop_length=config.win_step,
                                             win_length=config.win_size,
                                             n_fft=config.nfft),
                                ref=np.max)
    ld.specshow(S, y_axis='linear', hop_length=config.win_step, sr=config.fs)

    plt.show()

    return recon_speech
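
A minimal driver sketch for se_test over matched clean/noisy raw file pairs; the directory layout is hypothetical, and the snr and noise_type arguments are effectively unused above because the MATLAB mixing call is commented out:

import glob

# Hypothetical test layout; adjust the globs to the actual data paths.
clean_files = sorted(glob.glob('./data/test/clean/*.raw'))
noisy_files = sorted(glob.glob('./data/test/noisy/*.raw'))

for clean_path, noisy_path in zip(clean_files, noisy_files):
    # Prints PESQ/LSD/STOI/SSNR before and after enhancement and shows the spectrograms.
    se_test(clean_path, noisy_path, snr=5)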