def enhance(self, wav_dir):
    """Enhance one noisy raw-speech file and write the result to disk as a wav.

    Args:
        wav_dir: path to the noisy input file (raw format, read via utils.read_raw).

    Returns:
        recon_speech: the reconstructed (enhanced) time-domain signal as a numpy array.
    """
    noisy_speech = utils.read_raw(wav_dir)

    # Stage the input as a .npy file so DataReader can consume it.
    temp_dir = './temp/temp.npy'
    os.makedirs(os.path.dirname(temp_dir), exist_ok=True)  # fix: don't assume ./temp exists
    np.save(temp_dir, noisy_speech)

    test_dr = dr.DataReader(temp_dir, '', self.norm_path, dist_num=config.dist_num,
                            is_training=False, is_shuffle=False)
    mean, std = test_dr.norm_process(self.norm_path + '/norm_noisy.mat')

    while True:
        # Pull every frame of the file in a single batch.
        test_inputs, test_labels, test_inphase, test_outphase = test_dr.whole_batch(
            test_dr.num_samples)

        # lstm/fcn graphs have no dropout placeholder; all other modes need
        # keep_prob pinned to 1.0 for inference.
        if config.mode not in ('lstm', 'fcn'):
            feed_dict = {self.node_inputs: test_inputs,
                         self.node_labels: test_labels,
                         self.node_keep_prob: 1.0}
        else:
            feed_dict = {self.node_inputs: test_inputs,
                         self.node_labels: test_labels}

        pred = self.sess.run(self.node_prediction, feed_dict=feed_dict)

        if test_dr.file_change_checker():  # whole file consumed -> reconstruct and stop
            print(wav_dir)
            lpsd = np.expand_dims(np.reshape(pred, [-1, config.freq_size]), axis=2)
            # De-normalize the predicted log-power spectra.
            lpsd = np.squeeze((lpsd * std * config.global_std) + mean)
            # Rebuild the waveform from the de-normalized spectra + noisy input phase.
            recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)),
                                           np.transpose(test_inphase, (1, 0)),
                                           win_size=config.win_size,
                                           win_step=config.win_step,
                                           fs=config.fs)
            test_dr.reader_initialize()
            break

    file_dir = os.path.join(self.save_dir,
                            os.path.basename(wav_dir).replace('noisy', 'enhanced').replace('raw', 'wav'))
    # NOTE(review): librosa.output.write_wav was removed in librosa >= 0.8 —
    # this call requires librosa < 0.8 (or migration to soundfile.write).
    librosa.output.write_wav(file_dir, recon_speech, int(config.fs), norm=True)
    return recon_speech
def speech_enhance(wav_dir, graph_name):
    """Enhance one noisy raw-speech file using a frozen graph loaded from disk.

    Args:
        wav_dir: path to the noisy input file (raw format, read via utils.read_raw).
        graph_name: path to the frozen .pb graph to load.

    Returns:
        recon_speech: the reconstructed (enhanced) time-domain signal.
    """
    noisy_speech = utils.read_raw(wav_dir)

    # Stage the input as a .npy file so DataReader can consume it.
    temp_dir = './temp/temp.npy'
    os.makedirs(os.path.dirname(temp_dir), exist_ok=True)  # fix: don't assume ./temp exists
    np.save(temp_dir, noisy_speech)

    graph = gt.load_graph(graph_name)
    norm_path = os.path.abspath('./data/train/norm')
    test_dr = dr.DataReader(temp_dir, '', norm_path, dist_num=config.dist_num,
                            is_training=False, is_shuffle=False)

    node_inputs = graph.get_tensor_by_name('prefix/model_1/inputs:0')
    node_labels = graph.get_tensor_by_name('prefix/model_1/labels:0')
    node_keep_prob = graph.get_tensor_by_name('prefix/model_1/keep_prob:0')
    node_prediction = graph.get_tensor_by_name('prefix/model_1/pred:0')

    pred = []
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess_config.gpu_options.allow_growth = True

    # fix: open ONE session for the whole file. The original constructed a new
    # tf.Session inside the loop for every batch, which is very expensive.
    with tf.Session(graph=graph, config=sess_config) as sess:
        while True:
            test_inputs, test_labels = test_dr.next_batch(config.test_batch_size)
            feed_dict = {node_inputs: test_inputs,
                         node_labels: test_labels,
                         node_keep_prob: 1.0}  # inference: disable dropout
            pred_temp = sess.run(node_prediction, feed_dict=feed_dict)
            pred.append(pred_temp)

            if test_dr.file_change_checker():  # whole file consumed
                print(wav_dir)
                phase = test_dr.phase[0]
                # Concatenate per-batch predictions and trim padding to the true
                # frame count given by the phase matrix.
                lpsd = np.expand_dims(np.reshape(np.concatenate(pred, axis=0),
                                                 [-1, config.freq_size])[0:phase.shape[0], :], axis=2)
                mean, std = test_dr.norm_process(norm_path + '/norm_noisy.mat')
                lpsd = np.squeeze((lpsd * std) + mean)  # denorm
                recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)),
                                               np.transpose(phase, (1, 0)),
                                               win_size=config.win_size,
                                               win_step=config.win_step,
                                               fs=config.fs)
                test_dr.reader_initialize()
                break

    return recon_speech
def do_summary(self, m_summary, sess, itr):
    """Run the summary model over the validation temp file and log results.

    Reconstructs enhanced speech from the model's predictions, then writes
    audio clips, spectrogram images and speech-quality scalars (pesq/stoi/
    lsd/ssnr) to TensorBoard(X). On the first summary step it additionally
    logs one-time text summaries (filename, configuration, model source).

    Args:
        m_summary: model whose inputs/labels/keep_prob/pred nodes are fed.
        sess: live tf.Session used to run the model.
        itr: current training iteration (used as the summary step).

    NOTE(review): self.noisy_measure is only assigned when
    itr == config.summary_step; later calls that read it assume the first
    call happened at exactly that iteration — confirm against the caller.
    """
    valid_path = self.valid_path
    clean_speech = self.clean_speech
    clean_speech = utils.identity_trans(clean_speech)
    noisy_speech = self.noisy_speech
    noisy_speech = utils.identity_trans(noisy_speech)
    temp_dir = self.temp_dir
    name = self.name
    logs_dir = self.logs_dir
    writer = SummaryWriter(log_dir=self.logs_dir + '/summary')
    summary_dr = dr.DataReader(temp_dir, '', valid_path["norm_path"], dist_num=config.dist_num,
                               is_training=False, is_shuffle=False)
    pred = []
    # Feed the validation file batch-by-batch until the reader signals that
    # the whole file has been consumed, collecting predictions as we go.
    while True:
        summary_inputs, summary_labels = summary_dr.next_batch(config.batch_size)
        feed_dict = {m_summary.inputs: summary_inputs, m_summary.labels: summary_labels,
                     m_summary.keep_prob: 1.0}
        pred_temp = sess.run(m_summary.pred, feed_dict=feed_dict)
        pred.append(pred_temp)
        if summary_dr.file_change_checker():
            phase = summary_dr.phase[0]
            # Trim batch padding down to the true frame count given by phase.
            lpsd = np.expand_dims(
                np.reshape(np.concatenate(pred, axis=0),
                           [-1, config.freq_size])[0:phase.shape[0], :], axis=2)
            mean, std = summary_dr.norm_process(valid_path["norm_path"] + '/norm_noisy.mat')
            # 1.18 is presumably an empirical de-normalization scale — TODO confirm.
            lpsd = np.squeeze((lpsd * std * 1.18) + mean)  # denorm
            recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)), np.transpose(phase, (1, 0)),
                                           win_size=config.win_size, win_step=config.win_step,
                                           fs=config.fs)
            # plt.plot(recon_speech)
            # plt.show()
            # lab = np.reshape(np.asarray(lab), [-1, 1])
            summary_dr.reader_initialize()
            break

    # write summary
    # One-time text summaries, emitted only on the first summary step.
    if itr == config.summary_step:
        writer.close()
        self.noisy_measure = utils.se_eval(clean_speech, np.squeeze(noisy_speech), float(config.fs))
        summary_fname = tf.summary.text(name + '_filename', tf.convert_to_tensor(self.noisy_dir))
        if name == 'train':
            # HTML-formatted hyperparameter dump for the TensorBoard text tab.
            config_str = "<br>sampling frequency: %d</br>" \
                         "<br>window step: %d ms</br>" \
                         "<br>window size: %d ms</br>" \
                         "<br>fft size: %d</br>" \
                         "<br>learning rate: %f</br><br>learning rate decay: %.4f</br><br>learning" \
                         " rate decay frequency: %.4d</br>" \
                         "<br>dropout rate: %.4f</br><br>max epoch:" \
                         " %.4e</br><br>batch size: %d</br><br>model type: %s</br>" \
                         % (config.fs, (config.win_step/config.fs*1000),
                            (config.win_size/config.fs*1000), config.nfft, config.lr,
                            config.lrDecayRate, config.lrDecayFreq, config.keep_prob,
                            config.max_epoch, config.batch_size, config.mode)
            summary_config = tf.summary.text(name + '_configuration', tf.convert_to_tensor(config_str))
            # Scrape the model's inference() source out of trnmodel.py so the
            # exact architecture is archived alongside the run.
            code_list = []
            read_flag = False
            with open('./lib/trnmodel.py', 'r') as f:
                while True:
                    line = f.readline()
                    if "def inference(self, inputs):" in line:
                        read_flag = True
                    if "return fm" in line:  # end of inference() — capture and stop
                        code_list.append('<br>' + line.replace('\n', '') + '</br>')
                        break
                    if read_flag:
                        code_list.append('<br>' + line.replace('\n', '') + '</br>')
            code_list = "<pre>" + "".join(code_list) + "</pre>"
            summary_model = tf.summary.text('train_model', tf.convert_to_tensor(code_list))
            summary_op = tf.summary.merge([summary_fname, summary_config, summary_model])
        else:
            summary_op = tf.summary.merge([summary_fname])
        # Text summaries need a (fresh) session + FileWriter of their own.
        with tf.Session() as sess:
            summary_writer = tf.summary.FileWriter(logs_dir + '/summary/text')
            text = sess.run(summary_op)
            summary_writer.add_summary(text, 1)
            summary_writer.close()

    # Per-summary-step logging (audio, spectrograms, quality metrics).
    # NOTE(review): original indentation was lost; this is read as running on
    # every call, re-creating the writer closed above on the first step.
    writer = SummaryWriter(log_dir=logs_dir + '/summary')
    writer.add_audio(name + '_audio_ref' + '/clean',
                     clean_speech / np.max(np.abs(clean_speech)), itr, sample_rate=config.fs)
    writer.add_audio(name + '_audio_ref' + '/noisy',
                     noisy_speech / np.max(np.abs(noisy_speech)), itr, sample_rate=config.fs)
    clean_S = get_spectrogram(clean_speech)
    noisy_S = get_spectrogram(noisy_speech)
    writer.add_image(name + '_spectrogram_ref' + '/clean', clean_S, itr)  # image_shape = (C, H, W)
    writer.add_image(name + '_spectrogram_ref' + '/noisy', noisy_S, itr)  # image_shape = (C, H, W)
    # Compare enhanced output against the stored noisy baseline measures.
    enhanced_measure = utils.se_eval(clean_speech, recon_speech, float(config.fs))
    writer.add_scalars(name + '_speech_quality' + '/pesq',
                       {'enhanced': enhanced_measure['pesq'],
                        'ref': self.noisy_measure['pesq']}, itr)
    writer.add_scalars(name + '_speech_quality' + '/stoi',
                       {'enhanced': enhanced_measure['stoi'],
                        'ref': self.noisy_measure['stoi']}, itr)
    writer.add_scalars(name + '_speech_quality' + '/lsd',
                       {'enhanced': enhanced_measure['lsd'],
                        'ref': self.noisy_measure['lsd']}, itr)
    writer.add_scalars(name + '_speech_quality' + '/ssnr',
                       {'enhanced': enhanced_measure['ssnr'],
                        'ref': self.noisy_measure['ssnr']}, itr)
    writer.add_audio(name + '_audio_enhanced' + '/enhanced',
                     recon_speech / np.max(np.abs(recon_speech)), itr, sample_rate=config.fs)
    enhanced_S = get_spectrogram(recon_speech)
    writer.add_image(name + '_spectrogram_enhanced' + '/enhanced', enhanced_S, itr)  # image_shape = (C, H, W)
    writer.close()
def se_test(wav_dir, noise_dir, snr, noise_type=1):
    """Enhance a noisy file with the newest frozen graph and report quality metrics.

    Prints pesq/lsd/stoi/ssnr (noisy -> enhanced) and shows clean/noisy/enhanced
    spectrograms with matplotlib.

    Args:
        wav_dir: path to the clean reference file (raw format).
        noise_dir: path to the noisy input file (raw format).
        snr, noise_type: retained for interface compatibility; only consumed by
            the commented-out MATLAB noise-mixing path below.

    Returns:
        recon_speech: the reconstructed (enhanced) time-domain signal.
    """
    # clean_speech, clean_fs = librosa.load(wav_dir, config.fs)
    clean_speech = utils.read_raw(wav_dir)
    # fix: a MATLAB engine was started here but never used (its only consumer is
    # the commented-out call below) and never shut down — removed to stop the leak.
    # noisy_speech = np.array(eng.noise_add(wav_dir, noise_dir, noise_type, snr, nargout=1))
    # noisy_speech, noisy_fs = librosa.load(noise_dir, config.fs)
    noisy_speech = utils.read_raw(noise_dir)

    # Stage the input as a .npy file so DataReader can consume it.
    temp_dir = './data/test/temp/temp.npy'
    os.makedirs(os.path.dirname(temp_dir), exist_ok=True)  # fix: don't assume dir exists
    np.save(temp_dir, noisy_speech)

    graph_name = sorted(glob.glob('./saved_model/*.pb'))[-1]  # newest frozen graph
    graph = gt.load_graph(graph_name)
    norm_path = os.path.abspath('./data/train/norm')
    test_dr = dr.DataReader(temp_dir, '', norm_path, dist_num=config.dist_num,
                            is_training=False, is_shuffle=False)

    node_inputs = graph.get_tensor_by_name('prefix/model_1/inputs:0')
    node_labels = graph.get_tensor_by_name('prefix/model_1/labels:0')
    node_keep_prob = graph.get_tensor_by_name('prefix/model_1/keep_prob:0')
    node_prediction = graph.get_tensor_by_name('prefix/model_1/pred:0')

    pred = []
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess_config.gpu_options.allow_growth = True

    # fix: open ONE session for the whole file. The original constructed a new
    # tf.Session inside the loop for every batch, which is very expensive.
    with tf.Session(graph=graph, config=sess_config) as sess:
        while True:
            test_inputs, test_labels = test_dr.next_batch(config.test_batch_size)
            feed_dict = {node_inputs: test_inputs,
                         node_labels: test_labels,
                         node_keep_prob: 1.0}  # inference: disable dropout
            pred_temp = sess.run(node_prediction, feed_dict=feed_dict)
            pred.append(pred_temp)

            if test_dr.file_change_checker():  # whole file consumed
                phase = test_dr.phase[0]
                # Concatenate per-batch predictions, trim padding to true frame count.
                lpsd = np.expand_dims(np.reshape(np.concatenate(pred, axis=0),
                                                 [-1, config.freq_size])[0:phase.shape[0], :], axis=2)
                mean, std = test_dr.norm_process(norm_path + '/norm_noisy.mat')
                lpsd = np.squeeze((lpsd * std) + mean)  # denorm
                recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)),
                                               np.transpose(phase, (1, 0)),
                                               win_size=config.win_size,
                                               win_step=config.win_step,
                                               fs=config.fs)
                test_dr.reader_initialize()
                break

    # os.popen('rm -rf ' + temp_dir)

    # Score noisy vs enhanced against the clean reference, trimmed to equal length.
    noisy_measure = utils.se_eval(clean_speech[0:recon_speech.shape[0]],
                                  np.squeeze(noisy_speech[0:recon_speech.shape[0]]),
                                  float(config.fs))
    enhanced_measure = utils.se_eval(clean_speech[0:recon_speech.shape[0]],
                                     recon_speech, float(config.fs))
    print("pesq: %.4f -> %.4f" % (noisy_measure["pesq"], enhanced_measure["pesq"]))
    print("lsd: %.4f -> %.4f" % (noisy_measure["lsd"], enhanced_measure["lsd"]))
    print("stoi: %.4f -> %.4f" % (noisy_measure["stoi"], enhanced_measure["stoi"]))
    print("ssnr: %.4f -> %.4f" % (noisy_measure["ssnr"], enhanced_measure["ssnr"]))

    # Spectrograms: clean / noisy / enhanced, top to bottom.
    plt.subplot(3, 1, 1)
    S = librosa.amplitude_to_db(
        librosa.stft(clean_speech[0:recon_speech.shape[0]], hop_length=config.win_step,
                     win_length=config.win_size, n_fft=config.nfft), ref=np.max)
    ld.specshow(S, y_axis='linear', hop_length=config.win_step, sr=config.fs)
    plt.subplot(3, 1, 2)
    S = librosa.amplitude_to_db(
        librosa.stft(np.squeeze(noisy_speech[0:recon_speech.shape[0]]),
                     hop_length=config.win_step, win_length=config.win_size,
                     n_fft=config.nfft), ref=np.max)
    ld.specshow(S, y_axis='linear', hop_length=config.win_step, sr=config.fs)
    plt.subplot(3, 1, 3)
    S = librosa.amplitude_to_db(
        librosa.stft(recon_speech, hop_length=config.win_step,
                     win_length=config.win_size, n_fft=config.nfft), ref=np.max)
    ld.specshow(S, y_axis='linear', hop_length=config.win_step, sr=config.fs)
    plt.show()

    return recon_speech