num_workers=4, drop_last=True) for _, (clean_speech, noisy_speech) in enumerate(train_dataloader): clean_speech = torch.Tensor(clean_speech).to(device) noisy_speech = torch.Tensor(noisy_speech).to(device) ref_batch = torch.cat([clean_speech, noisy_speech], 1) break logger.info("Done.") logger.info("Start to construct model...") Generator, Discriminator = get_model(gen_model, dis_model) reconstruction_loss = get_recon_loss(reconstruction_loss) disc_g_loss, disc_d_loss = get_disc_loss(args.adversarial_loss) calc_target = get_target(args.target_type) generator = Generator(**model_opts['gen_model_opts']).to(device) g_optimizer = optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999)) # discriminator = ConvDiscriminator().to(device) if args.dist_alpha > 0 or args.feat_alpha > 0 or args.adversarial_loss is not None: discriminator = Discriminator( **model_opts['dis_model_opts']).to(device) d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.999)) if args.disc_step > 0: state_dict = torch.load("Checkpoints/GAN/%s/checkpoint_%09d.pth" % (args.disc_name, args.disc_step))
def calc_func(noisy_dir_path):
    """Enhance every utterance listed in ``<noisy_dir_path>/wav.scp`` with the
    trained generator and write the enhanced log-mel features as Kaldi ark/scp.

    For each wav the matching ``*_early50.wav`` reference is loaded, both are
    rescaled with the same constant as in training, features are computed, the
    generator is run, and the enhanced log-fbank is appended to ``feats.ark``
    (with an entry in ``feats.scp``). In debug mode the intermediate tensors of
    the FIRST utterance are dumped to a .mat file and the function returns.

    Args:
        noisy_dir_path: directory containing a Kaldi-style ``wav.scp``.

    NOTE(review): relies on module-level names (`args`, `method`, `opts`,
    `get_model`, `get_target`, `get_fft_mel_mat`, `log_fbank`,
    `calc_rescale_c`, `read_path_list`, `KaldiFeatHolder`, `print_with_time`,
    `post_process`) defined elsewhere in this file.
    """
    with torch.no_grad():
        debug_model = args.debug_model
        _method = method
        # Close the config file deliberately instead of json.load(open(...)).
        with open(os.path.join("configs/%s.json" % args.model_config), 'r') as f:
            model_opts = json.load(f)
        gen_model = model_opts['gen_model_name']
        calc_target = get_target(args.target_type)
        device = torch.device("cuda")
        print_with_time("Loading model...")
        Generator, _ = get_model(gen_model, None)
        # FIX: unpack the options dict as keyword arguments. The training code
        # constructs the model as Generator(**model_opts['gen_model_opts']);
        # the missing ** here passed the whole dict as one positional arg.
        model = Generator(**model_opts['gen_model_opts']).to(device)
        checkpoint = torch.load("Checkpoints/%s/checkpoint_%09d.pth" %
                                (_method, args.global_step))
        model.load_state_dict(checkpoint["generator"])
        # model.load_state_dict(checkpoint["enhancer"])
        model.eval()
        melbank = get_fft_mel_mat(512, 16000, 40)
        _method = "_".join([_method, str(args.global_step)])
        if debug_model:
            os.system('mkdir -p debug/%s' % _method)
        print_with_time("Start to enhance wav file in %s with method %s\n" %
                        (noisy_dir_path, _method))
        udir_path = "%s_%s" % (noisy_dir_path, _method)
        if not os.path.exists(udir_path):
            os.mkdir(udir_path)
        wav_scp = read_path_list(os.path.join(noisy_dir_path, "wav.scp"))
        # Output files are only needed outside debug mode.
        ark_file = None
        scp_file = None
        if not debug_model:
            ark_file = open(os.path.join(udir_path, "feats.ark"), 'wb')
            scp_file = open(os.path.join(udir_path, "feats.scp"), 'w')
        key_len = wav_scp[0].find(' ')
        kaldi_holder = KaldiFeatHolder(key_len, 3000, 40)
        offset = key_len + 1
        enhanced_number = 0
        # FIX: guarantee the ark/scp handles are closed even if an utterance
        # raises mid-loop (previously an exception leaked both files).
        try:
            for it, one_wav in enumerate(wav_scp):
                wav_id, wav_path = one_wav.split(' ')
                sr, noisy_speech = wavfile.read(wav_path)
                if len(noisy_speech.shape) > 1:
                    # Down-mix multi-channel audio to mono.
                    noisy_speech = np.mean(noisy_speech, 1)
                early50_path = wav_path.replace('.wav', '_early50.wav')
                sr, early50 = wavfile.read(early50_path)
                if len(early50.shape) > 1:
                    early50 = np.mean(early50, 1)
                # As in the training dataset, rescale the waveform so the
                # model sees the same input distribution; the same constant
                # is applied to the reference so targets stay aligned.
                # c = np.sqrt(np.mean(np.square(noisy_speech)))
                c = calc_rescale_c(noisy_speech, args.rescale_method)
                noisy_speech = noisy_speech / c
                early50 = early50 / c
                noisy_fbank, noisy_mag = log_fbank(
                    noisy_speech, False, True, True, None)
                early50_fbank, early50_mag = log_fbank(
                    early50, False, True, True, None)
                noise_fbank, noise_mag = log_fbank(
                    noisy_speech - early50, False, True, True, None)
                if args.feature_domain == "mel":
                    feat = torch.Tensor(noisy_fbank.T).unsqueeze(0).to(device)
                    label = torch.Tensor(early50_fbank.T).unsqueeze(0).to(device)
                    noise = torch.Tensor(noise_fbank.T).unsqueeze(0).to(device)
                else:
                    # Power spectra (magnitude squared) in the linear domain.
                    feat = torch.Tensor(
                        np.square(noisy_mag).T).unsqueeze(0).to(device)
                    label = torch.Tensor(
                        np.square(early50_mag).T).unsqueeze(0).to(device)
                    noise = torch.Tensor(
                        np.square(noise_mag).T).unsqueeze(0).to(device)
                if args.target_type.lower() == "mapping_mag":
                    # Magnitude-mapping models take sqrt(power) = magnitude.
                    predict = model.forward(feat.sqrt())
                else:
                    predict = model.forward(torch.log(feat + opts['eps']))
                results = calc_target(feat, label, noise, predict, opts)
                enhanced = results["enhanced"]
                predict = results["predict"]
                target = results["target"]
                if args.feature_domain == "mel":
                    enhanced_pow = 0
                    enhanced_fbank = enhanced[0, :, :].cpu().numpy()
                else:
                    enhanced_pow = enhanced[0, :, :].cpu().numpy()
                    enhanced_fbank = np.matmul(enhanced_pow, melbank.T)
                # Undo the rescaling (power domain => c**2) before the log.
                log_enhanced_fbank = np.log(
                    enhanced_fbank * (c ** 2.) + opts['eps'])
                if debug_model:
                    sio.savemat(
                        "debug/%s/%s_%s" % (_method, wav_id,
                                            wav_path.split('/')[-5]),
                        {
                            'noisy_mag': noisy_mag,
                            'noisy_fbank': noisy_fbank,
                            'enhanced_mag': np.sqrt(enhanced_pow).T,
                            'enhanced_fbank': enhanced_fbank.T,
                            'early50_mag': early50_mag,
                            'early50_fbank': early50_fbank,
                            'predict': predict[0, :, :].cpu().numpy().T,
                            'target': target[0, :, :].cpu().numpy().T,
                            'log_enhanced_fbank': log_enhanced_fbank.T,
                            'log_early50_fbank': np.log(early50_fbank * (c ** 2.)
                                                        + opts['eps']),
                            'c': c
                        })
                    # Debug mode inspects only the first utterance.
                    if it >= 0:
                        return
                else:
                    kaldi_holder.set_key(wav_id)
                    kaldi_holder.set_value(log_enhanced_fbank)
                    kaldi_holder.write_to(ark_file)
                    scp_file.write("%s %s/feats.ark:%d\n" %
                                   (wav_id, udir_path, offset))
                    offset += kaldi_holder.get_real_len()
                    enhanced_number += 1
                    if enhanced_number % 40 == 0:
                        print_with_time(
                            "Enhanced %5d(%6.2f%%) utterance" %
                            (enhanced_number,
                             100. * enhanced_number / len(wav_scp)))
            print_with_time("Enhanced %d utterance" % enhanced_number)
        finally:
            if ark_file is not None:
                ark_file.close()
            if scp_file is not None:
                scp_file.close()
        post_process(noisy_dir_path, udir_path)
        print_with_time("Done %s." % _method)