def calc_func(noisy_dir_path):
    """Enhance every utterance listed in ``<noisy_dir_path>/wav.scp``.

    The generator described by ``configs/<args.model_config>.json`` is built
    through ``get_model`` and initialised from the ``"generator"`` entry of
    ``Checkpoints/<method>/checkpoint_<args.global_step>.pth``.  For each
    ``"<wav_id> <wav_path>"`` entry the noisy waveform and its ``_early50``
    companion file are read, down-mixed to mono, rescaled by the factor
    returned from ``calc_rescale_c`` and converted to features with
    ``log_fbank``.  The model prediction is handed to the function returned
    by ``get_target(args.target_type)`` to obtain the enhanced features.

    * Normal mode: the log filter-bank features are written to
      ``<noisy_dir_path>_<method>_<step>/feats.ark`` with a matching
      ``feats.scp`` index, then ``post_process`` is called.
    * Debug mode (``args.debug_model``): intermediate arrays of the first
      utterance are dumped to ``debug/<method>_<step>/`` as a ``.mat`` file
      and the function returns immediately.

    The function relies on the module-level names ``args``, ``method`` and
    ``opts`` and requires a CUDA device.

    Args:
        noisy_dir_path: Directory that contains the ``wav.scp`` list.

    Returns:
        None.
    """
    with torch.no_grad():
        debug_model = args.debug_model
        _method = method
        # Use a context manager so the config file handle is not leaked.
        with open("configs/%s.json" % args.model_config, 'r') as config_file:
            model_opts = json.load(config_file)
        gen_model = model_opts['gen_model_name']
        calc_target = get_target(args.target_type)

        device = torch.device("cuda")
        print_with_time("Loading model...")
        Generator, _ = get_model(gen_model, None)
        model = Generator(model_opts['gen_model_opts']).to(device)

        checkpoint = torch.load(
            "Checkpoints/%s/checkpoint_%09d.pth" % (_method, args.global_step))
        model.load_state_dict(checkpoint["generator"])
        model.eval()
        melbank = get_fft_mel_mat(512, 16000, 40)

        # From here on the method name also carries the checkpoint step.
        _method = "_".join([_method, str(args.global_step)])
        if debug_model:
            # os.makedirs avoids spawning a shell with an interpolated string.
            os.makedirs('debug/%s' % _method, exist_ok=True)
        print_with_time("Start to enhance wav file in %s with method %s\n"
                        % (noisy_dir_path, _method))
        udir_path = "%s_%s" % (noisy_dir_path, _method)
        os.makedirs(udir_path, exist_ok=True)
        wav_scp = read_path_list(os.path.join(noisy_dir_path, "wav.scp"))

        # The output files only exist in normal mode; keeping None
        # placeholders lets the cleanup below work in both modes.
        ark_file = None
        scp_file = None
        enhanced_number = 0
        try:
            if not debug_model:
                ark_file = open(os.path.join(udir_path, "feats.ark"), 'wb')
                scp_file = open(os.path.join(udir_path, "feats.scp"), 'w')
                key_len = wav_scp[0].find(' ')
                kaldi_holder = KaldiFeatHolder(key_len, 3000, 40)
                # Offset of the first record's payload: key plus separator.
                offset = key_len + 1

            for it, one_wav in enumerate(wav_scp):
                wav_id, wav_path = one_wav.split(' ')
                _, noisy_speech = wavfile.read(wav_path)
                # Average the channels of multi-channel recordings.
                if len(noisy_speech.shape) > 1:
                    noisy_speech = np.mean(noisy_speech, 1)

                early50_path = wav_path.replace('.wav', '_early50.wav')
                _, early50 = wavfile.read(early50_path)
                if len(early50.shape) > 1:
                    early50 = np.mean(early50, 1)

                # Rescale the waveforms the same way as the training data so
                # that they match the input the model expects.
                c = calc_rescale_c(noisy_speech, args.rescale_method)
                noisy_speech = noisy_speech / c
                early50 = early50 / c

                noisy_fbank, noisy_mag = log_fbank(
                    noisy_speech, False, True, True, None)
                early50_fbank, early50_mag = log_fbank(
                    early50, False, True, True, None)
                noise_fbank, noise_mag = log_fbank(
                    noisy_speech - early50, False, True, True, None)

                if args.feature_domain == "mel":
                    feat = torch.Tensor(noisy_fbank.T).unsqueeze(0).to(device)
                    label = torch.Tensor(early50_fbank.T).unsqueeze(0).to(device)
                    noise = torch.Tensor(noise_fbank.T).unsqueeze(0).to(device)
                else:
                    feat = torch.Tensor(
                        np.square(noisy_mag).T).unsqueeze(0).to(device)
                    label = torch.Tensor(
                        np.square(early50_mag).T).unsqueeze(0).to(device)
                    noise = torch.Tensor(
                        np.square(noise_mag).T).unsqueeze(0).to(device)

                if args.target_type.lower() == "mapping_mag":
                    predict = model.forward(feat.sqrt())
                else:
                    predict = model.forward(torch.log(feat + opts['eps']))

                results = calc_target(feat, label, noise, predict, opts)
                enhanced = results["enhanced"]
                predict = results["predict"]
                target = results["target"]

                if args.feature_domain == "mel":
                    # No spectrum is available in this branch; 0 is a
                    # placeholder for the debug dump.
                    enhanced_pow = 0
                    enhanced_fbank = enhanced[0, :, :].cpu().numpy()
                else:
                    enhanced_pow = enhanced[0, :, :].cpu().numpy()
                    enhanced_fbank = np.matmul(enhanced_pow, melbank.T)

                # Undo the waveform rescaling (c ** 2 in this feature domain)
                # before taking the log.
                log_enhanced_fbank = np.log(
                    enhanced_fbank * (c**2.) + opts['eps'])

                if debug_model:
                    # NOTE: indexing with [-5] requires wav_path to contain
                    # at least five '/'-separated components.
                    sio.savemat(
                        "debug/%s/%s_%s" % (_method, wav_id,
                                            wav_path.split('/')[-5]),
                        {
                            'noisy_mag': noisy_mag,
                            'noisy_fbank': noisy_fbank,
                            'enhanced_mag': np.sqrt(enhanced_pow).T,
                            'enhanced_fbank': enhanced_fbank.T,
                            'early50_mag': early50_mag,
                            'early50_fbank': early50_fbank,
                            'predict': predict[0, :, :].cpu().numpy().T,
                            'target': target[0, :, :].cpu().numpy().T,
                            'log_enhanced_fbank': log_enhanced_fbank.T,
                            'log_early50_fbank': np.log(
                                early50_fbank * (c**2.) + opts['eps']),
                            'c': c
                        })
                    # Always true for enumerate indices: debug mode stops
                    # after the first utterance.
                    if it >= 0:
                        return
                else:
                    kaldi_holder.set_key(wav_id)
                    kaldi_holder.set_value(log_enhanced_fbank)
                    kaldi_holder.write_to(ark_file)
                    scp_file.write("%s %s/feats.ark:%d\n"
                                   % (wav_id, udir_path, offset))
                    offset += kaldi_holder.get_real_len()

                enhanced_number += 1
                if enhanced_number % 40 == 0:
                    print_with_time(
                        "Enhanced %5d(%6.2f%%) utterance"
                        % (enhanced_number,
                           100. * enhanced_number / len(wav_scp)))
            print_with_time("Enhanced %d utterance" % enhanced_number)
        finally:
            # Close whatever was opened, also when an utterance fails or the
            # debug branch returns early.
            for handle in (ark_file, scp_file):
                if handle is not None:
                    handle.close()

        if not debug_model:
            # Debug mode writes no feats.ark/feats.scp, so there is nothing
            # to post-process.
            post_process(noisy_dir_path, udir_path)
        print_with_time("Done %s." % _method)
def calc_func(noisy_dir_path):
    """Enhance the utterances listed in ``<noisy_dir_path>/wav.scp``.

    Uses the hard-coded ``Tan2018CRN_mag_early50`` generator
    (``Generator(64, 256)``) and its checkpoint.  Each waveform is down-mixed
    to mono, normalised by its RMS value ``c``, converted with ``log_fbank``
    and passed through the model; the squared model output is scaled back by
    ``c ** 2`` and projected with the mel matrix from ``get_fft_mel_mat``.

    * ``debug_model = True`` (the value hard-coded below): for the first six
      utterances the noisy, enhanced and ``_early50`` reference features are
      dumped to ``debug/<method>/`` as ``.mat`` files, then the function
      returns.
    * ``debug_model = False``: the log filter-bank features are written to
      ``<noisy_dir_path>_<method>/feats.ark`` with a ``feats.scp`` index and
      ``post_process`` is called afterwards.

    Requires a CUDA device.

    Args:
        noisy_dir_path: Directory that contains the ``wav.scp`` list.

    Returns:
        None.
    """
    debug_model = True
    melbank = get_fft_mel_mat(512, 16000, 40)
    method = "Tan2018CRN_mag_early50"
    if debug_model:
        # os.makedirs avoids spawning a shell just to create a directory.
        os.makedirs('debug/%s' % method, exist_ok=True)
    device = torch.device("cuda")
    print_with_time("Loading model...")
    model = Generator(64, 256).to(device)
    checkpoint = torch.load(
        "Checkpoints/Tan2018CRN_mag_early50/checkpoint_000096336.pth")
    model.load_state_dict(checkpoint["generator"])
    model.eval()
    print_with_time("Start to enhance wav file in %s with method %s\n"
                    % (noisy_dir_path, method))
    udir_path = "%s_%s" % (noisy_dir_path, method)
    os.makedirs(udir_path, exist_ok=True)
    wav_scp = read_path_list(os.path.join(noisy_dir_path, "wav.scp"))

    enhanced_number = 0
    # Number of context frames spliced onto each side of a frame; with both
    # at 0 the splicing branch below is skipped.
    left_frame = 0
    right_frame = 0

    # The output files only exist in non-debug mode; None placeholders keep
    # the cleanup below valid in both modes.
    ark_file = None
    scp_file = None
    try:
        if not debug_model:
            ark_file = open(os.path.join(udir_path, "feats.ark"), 'wb')
            scp_file = open(os.path.join(udir_path, "feats.scp"), 'w')
            key_len = wav_scp[0].find(' ')
            kaldi_holder = KaldiFeatHolder(key_len, 2000, 40)
            # Offset of the first record's payload: key plus separator.
            offset = key_len + 1

        for it, one_wav in enumerate(wav_scp):
            wav_id, wav_path = one_wav.split(' ')
            _, noisy_speech = wavfile.read(wav_path)
            # Average the channels of multi-channel (binaural) recordings.
            if len(noisy_speech.shape) > 1:
                noisy_speech = np.mean(noisy_speech, 1)
            # RMS normalisation of the waveform.
            c = np.sqrt(np.mean(np.square(noisy_speech)))
            noisy_speech = noisy_speech / c
            n_noisy_feat, n_noisy_mag = log_fbank(
                noisy_speech, False, True, True, None)
            feat = torch.Tensor(n_noisy_mag.T).unsqueeze(0).to(device)

            if left_frame > 0 or right_frame > 0:
                # Splice neighbouring frames along the last axis.
                _, t, _ = feat.size()
                pad_feats = F.pad(
                    feat.unsqueeze(1),
                    (0, 0, left_frame, right_frame)).squeeze(1)
                ex_list = [
                    pad_feats[:, i:i + t, :]
                    for i in range(left_frame + 1 + right_frame)
                ]
                feat = torch.cat(ex_list, 2)

            with torch.no_grad():
                enhanced_mag = model.forward(feat)
                enhanced = enhanced_mag.pow(2.)
            # Undo the RMS normalisation (c ** 2 on the squared output).
            enhanced = (enhanced[0, :, :] * c**2.).cpu().numpy()
            enhanced_fbank = np.matmul(melbank, enhanced.T)
            log_enhanced_fbank = np.log(enhanced_fbank + 1e-12)

            if debug_model:
                early50_path = wav_path.replace('.wav', '_early50.wav')
                _, early50 = wavfile.read(early50_path)
                if len(early50.shape) > 1:
                    early50 = np.mean(early50, 1)
                early50 = early50 / c
                early50_feat, early50_mag = log_fbank(
                    early50, False, True, True, None)
                # NOTE: indexing with [-5] requires wav_path to contain at
                # least five '/'-separated components.
                sio.savemat(
                    "debug/%s/%s_%s" % (method, wav_id,
                                        wav_path.split('/')[-5]),
                    {
                        'noisy_mag': n_noisy_mag,
                        'noisy_feat': n_noisy_feat,
                        'enhanced_mag': enhanced_mag[0, :, :].cpu().numpy().T,
                        'enhanced_feat': enhanced_fbank,
                        'early50_mag': early50_mag,
                        'early50_feat': early50_feat,
                    })
                # Stop after dumping six utterances.
                if it >= 5:
                    return
            else:
                kaldi_holder.set_key(wav_id)
                kaldi_holder.set_value(log_enhanced_fbank.T)
                kaldi_holder.write_to(ark_file)
                scp_file.write("%s %s/feats.ark:%d\n"
                               % (wav_id, udir_path, offset))
                offset += kaldi_holder.get_real_len()

            enhanced_number += 1
            if enhanced_number % 40 == 0:
                print_with_time(
                    "Enhanced %5d(%6.2f%%) utterance"
                    % (enhanced_number,
                       100. * enhanced_number / len(wav_scp)))
        print_with_time("Enhanced %d utterance" % enhanced_number)
    finally:
        # Close whatever was opened.  Previously the unconditional
        # ark_file.close() raised NameError when debug mode ran out of
        # utterances before reaching the early return.
        for handle in (ark_file, scp_file):
            if handle is not None:
                handle.close()

    if not debug_model:
        # Debug mode writes no feats.ark/feats.scp, so there is nothing to
        # post-process.
        post_process(noisy_dir_path, udir_path)
def calc_func(noisy_dir_path):
    """Experimental enhancement pass over ``<noisy_dir_path>/wav.scp``.

    Loads the hard-coded ``Tan2018CRN_mag_early50`` checkpoint, opens
    ``feats.ark`` / ``feats.scp`` in ``<noisy_dir_path>_<method>`` and then,
    for the first utterance, computes features with ``log_fbank``, calls
    ``sio.savemat`` and returns.

    NOTE(review): as written, the live code looks like a half-edited
    experiment - confirm the intended variant before relying on it:

    * ``log_mask``, ``n_early50_mag`` and ``enhanced_power`` (used in the
      ``savemat`` call) and ``n_irm`` (used after the ``return``) are only
      assigned in commented-out lines of this function.  Unless they exist
      as module-level names, the ``savemat`` call raises ``NameError``.
    * The unconditional ``return`` inside the loop makes the Kaldi-writing
      statements after it unreachable, and ``ark_file`` / ``scp_file`` are
      not closed on that path.
    * ``model`` is loaded but only referenced from commented-out code;
      ``melbank``, ``left_frame``, ``right_frame`` and ``log_noisy_power``
      are assigned but not read by live code.

    Args:
        noisy_dir_path: Directory that contains the ``wav.scp`` list.

    Returns:
        None.
    """
    # nn.Module.dump_patches = True
    melbank = get_fft_mel_mat(512, 16000, 40)
    method = "Tan2018CRN_mag_early50"
    device = torch.device("cuda")
    print_with_time("Loading model...")
    model = Generator(64, 256).to(device)
    checkpoint = torch.load("Checkpoints/Tan2018CRN_mag_early50/checkpoint_000096336.pth")
    model.load_state_dict(checkpoint["generator"])
    model.eval()
    print_with_time("Start to enhance wav file in %s with method %s\n" % (noisy_dir_path, method))
    # Output directory sits next to the input directory, suffixed with the method name.
    udir_path = "%s_%s" % (noisy_dir_path, method)
    if not os.path.exists(udir_path):
        os.mkdir(udir_path)
    wav_scp = read_path_list(os.path.join(noisy_dir_path, "wav.scp"))
    ark_file = open(os.path.join(udir_path, "feats.ark"), 'wb')
    scp_file = open(os.path.join(udir_path, "feats.scp"), 'w')
    # Length of the utterance id, taken from the first "<id> <path>" entry.
    key_len = wav_scp[0].find(' ')
    kaldi_holder = KaldiFeatHolder(key_len, 2000, 40)
    # Offset of the first record's payload: key plus separator.
    offset = key_len + 1
    enhanced_number = 0
    left_frame = 0
    right_frame = 0
    for one_wav in wav_scp:
        wav_id, wav_path = one_wav.split(' ')
        # wav_path = one_wav
        # if "dB" in wav_path:
        #     wav_path = wav_path.replace(".wav", "_early50.wav")
        sr, noisy_speech = wavfile.read(wav_path)
        # sr, early50_speech = wavfile.read(wav_path.replace(".wav", "_early50.wav"))
        # sr, direct_speech = wavfile.read(wav_path.replace(".wav", "_direct_sound.wav"))
        # process binaural waves.
        if len(noisy_speech.shape) > 1:
            noisy_speech = np.mean(noisy_speech, 1)
        # if len(early50_speech.shape) > 1:
        #     early50_speech = np.mean(early50_speech, 1)
        # if len(direct_speech.shape) > 1:
        #     direct_speech = np.mean(direct_speech, 1)
        # noisy_speech = noisy_speech.astype(np.int16)
        # librosa.output.write_wav("/home/duzhihao/440c0201_mono.wav", noisy_speech.astype(np.float32), 16000, True)
        # Normalise the waveform by its RMS value.
        c = np.sqrt(np.mean(np.square(noisy_speech)))
        noisy_speech = noisy_speech / c
        # early50_speech = early50_speech / c
        n_noisy_feat, n_noisy_mag = log_fbank(noisy_speech, False, True, True, None)
        # n_noise_feat, n_noise_mag = log_fbank(noisy_speech - early50_speech, False, True, True, None)
        # n_early50_feat, n_early50_mag = log_fbank(early50_speech, False, True, True, None)
        # n_irm = n_early50_feat[0] / (n_early50_feat[0] + n_noise_feat[0])
        log_noisy_power = np.log(n_noisy_mag ** 2 + 1e-12)
        # log_early50_power = np.log(n_early50_mag ** 2 + 1e-12)
        # log_mask = np.clip((log_early50_power+10) / (log_noisy_power+10), 0, 1)
        # log_enhanced_power = (log_noisy_power+10) * log_mask - 10
        # enhanced_power = np.exp(log_enhanced_power)
        # n_direct_feat = log_fbank(direct_speech, False, True, False, None)
        # n_noisy_feat = log_fbank_for_wu(noisy_speech, False, True, False, None)
        # log_noisy_feat = np.log(n_noisy_feat[0].T)
        # log_noisy_feat[np.isnan(log_noisy_feat)] = 0.
        # log_noisy_feat[np.isinf(log_noisy_feat)] = 0.
        # log_noisy_feat = log_noisy_feat[np.newaxis, :, :]
        # log_noisy_feat = torch.Tensor(log_noisy_feat).to(device)
        # n, t, d = log_noisy_feat.size()
        # if left_frame > 0 or right_frame > 0:
        #     pad_feats = F.pad(log_noisy_feat.unsqueeze(1), (0, 0, left_frame, right_frame)).squeeze(1)
        #     ex_list = [pad_feats[:, i:i+t, :] for i in range(left_frame+1+right_frame)]
        #     log_noisy_feat = torch.cat(ex_list, 2)
        # with torch.no_grad():
        #     feat_ex_list = [log_noisy_feat[:, :, i * 40:(i + 1) * 40].unsqueeze(1) for i in range(left_frame + 1 + right_frame)]
        #     nn_input = torch.cat(feat_ex_list, 1)
        #     mask = torch.sigmoid(model.forward(log_noisy_feat))
        #     mask = model.forward(log_noisy_feat)
        #     enhanced_feat = mask[0, :, :].cpu().numpy() * n_noisy_feat[0].T
        #     log_enhanced_feat = np.log(enhanced_feat)
        #     log_enhanced_feat = log_noisy_feat[:, :, 40*left_frame:40*(left_frame+1)]  # * mask
        #     log_enhanced_feat = model.forward(log_noisy_feat)
        #     log_enhanced_feat = log_enhanced_feat * (log_pow_max - log_pow_min) + log_pow_min
        #     log_enhanced_feat = log_enhanced_feat.cpu().numpy()
        # NOTE(review): log_mask, n_early50_mag and enhanced_power are only
        # defined in the commented-out lines above - this call presumably
        # raises NameError unless they are module-level names; confirm.
        sio.savemat(method + "_chime2", {'noisy_mag': n_noisy_mag, 'irm': log_mask, 'early50_mag': n_early50_mag, 'enhanced_mag': np.sqrt(enhanced_power),
                                         # 'direct_feat': n_direct_feat[0]
                                         })
        # NOTE(review): unconditional return - the remainder of the loop body
        # is unreachable and the two output files are left open on this path.
        return
        kaldi_holder.set_key(wav_id)
        # kaldi_holder.set_value(log_enhanced_feat[0, :, :].cpu().numpy())
        # kaldi_holder.set_value(np.log((n_early50_feat[0]).T + 1e-8))
        # NOTE(review): n_irm is likewise only assigned in commented-out code.
        kaldi_holder.set_value(np.log((n_noisy_feat[0] * n_irm).T + 1e-8))
        kaldi_holder.write_to(ark_file)
        scp_file.write("%s %s/feats.ark:%d\n" % (wav_id, udir_path, offset))
        offset += kaldi_holder.get_real_len()
        enhanced_number += 1
        # Progress report every 40 utterances.
        if enhanced_number % 40 == 0:
            print_with_time(
                "Enhanced %5d(%6.2f%%) utterance" % (enhanced_number, 100. * enhanced_number / len(wav_scp)))
    # Only reached when wav_scp yields no entries (see the return above).
    print_with_time("Enhanced %d utterance" % enhanced_number)
    ark_file.close()
    scp_file.close()
    post_process(noisy_dir_path, udir_path)