Example #1
                                       num_workers=4,
                                       drop_last=True)
    for _, (clean_speech, noisy_speech) in enumerate(train_dataloader):
        clean_speech = torch.Tensor(clean_speech).to(device)
        noisy_speech = torch.Tensor(noisy_speech).to(device)
        ref_batch = torch.cat([clean_speech, noisy_speech], 1)
        break
    logger.info("Done.")

    logger.info("Start to construct model...")

    Generator, Discriminator = get_model(gen_model, dis_model)
    reconstruction_loss = get_recon_loss(reconstruction_loss)
    disc_g_loss, disc_d_loss = get_disc_loss(args.adversarial_loss)

    calc_target = get_target(args.target_type)

    generator = Generator(**model_opts['gen_model_opts']).to(device)
    g_optimizer = optim.Adam(generator.parameters(),
                             lr=0.0001,
                             betas=(0.5, 0.999))
    # discriminator = ConvDiscriminator().to(device)
    if args.dist_alpha > 0 or args.feat_alpha > 0 or args.adversarial_loss is not None:
        discriminator = Discriminator(
            **model_opts['dis_model_opts']).to(device)
        d_optimizer = optim.Adam(discriminator.parameters(),
                                 lr=0.0001,
                                 betas=(0.5, 0.999))
        if args.disc_step > 0:
            state_dict = torch.load("Checkpoints/GAN/%s/checkpoint_%09d.pth" %
                                    (args.disc_name, args.disc_step))
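The snippet above pairs the clean and noisy waveforms into a single reference batch with torch.cat([clean_speech, noisy_speech], 1). A minimal sketch of what that concatenation produces, assuming the dataloader yields (batch, channel, samples) waveform tensors; the shapes below are illustrative, not taken from the snippet:

import torch

# illustrative shapes only: (batch, channel, samples) waveform tensors
clean_speech = torch.randn(4, 1, 16384)
noisy_speech = torch.randn(4, 1, 16384)

# concatenating along dim 1 stacks clean and noisy as two channels
ref_batch = torch.cat([clean_speech, noisy_speech], 1)
print(ref_batch.shape)  # torch.Size([4, 2, 16384])

A discriminator that expects paired input can then treat the clean/noisy pair as a single two-channel signal.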
Example #2
def calc_func(noisy_dir_path):
    with torch.no_grad():
        debug_model = args.debug_model
        _method = method
        model_opts = json.load(
            open(os.path.join("configs/%s.json" % args.model_config), 'r'))
        gen_model = model_opts['gen_model_name']
        calc_target = get_target(args.target_type)

        device = torch.device("cuda")
        print_with_time("Loading model...")
        Generator, _ = get_model(gen_model, None)
        model = Generator(**model_opts['gen_model_opts']).to(device)

        checkpoint = torch.load("Checkpoints/%s/checkpoint_%09d.pth" %
                                (_method, args.global_step))
        model.load_state_dict(checkpoint["generator"])
        # model.load_state_dict(checkpoint["enhancer"])
        model.eval()
        melbank = get_fft_mel_mat(512, 16000, 40)

        _method = "_".join([_method, str(args.global_step)])
        if debug_model:
            os.system('mkdir -p debug/%s' % _method)
        print_with_time("Start to enhance wav file in %s with method %s\n" %
                        (noisy_dir_path, _method))
        udir_path = "%s_%s" % (noisy_dir_path, _method)
        if not os.path.exists(udir_path):
            os.mkdir(udir_path)
        wav_scp = read_path_list(os.path.join(noisy_dir_path, "wav.scp"))
        if not debug_model:
            ark_file = open(os.path.join(udir_path, "feats.ark"), 'wb')
            scp_file = open(os.path.join(udir_path, "feats.scp"), 'w')
            key_len = wav_scp[0].find(' ')
            kaldi_holder = KaldiFeatHolder(key_len, 3000, 40)
            offset = key_len + 1
        enhanced_number = 0
        for it, (one_wav) in enumerate(wav_scp):
            wav_id, wav_path = one_wav.split(' ')
            sr, noisy_speech = wavfile.read(wav_path)
            if len(noisy_speech.shape) > 1:
                noisy_speech = np.mean(noisy_speech, 1)

            early50_path = wav_path.replace('.wav', '_early50.wav')
            sr, early50 = wavfile.read(early50_path)
            if len(early50.shape) > 1:
                early50 = np.mean(early50, 1)
            # As in the training dataset, rescale the waveform (e.g. "power_norm") so it matches the model's input.
            # c = np.sqrt(np.mean(np.square(noisy_speech)))
            c = calc_rescale_c(noisy_speech, args.rescale_method)
            noisy_speech = noisy_speech / c
            early50 = early50 / c

            noisy_fbank, noisy_mag = log_fbank(noisy_speech, False, True, True,
                                               None)
            early50_fbank, early50_mag = log_fbank(early50, False, True, True,
                                                   None)
            noise_fbank, noise_mag = log_fbank(noisy_speech - early50, False,
                                               True, True, None)
            if args.feature_domain == "mel":
                feat = torch.Tensor(noisy_fbank.T).unsqueeze(0).to(device)
                label = torch.Tensor(early50_fbank.T).unsqueeze(0).to(device)
                noise = torch.Tensor(noise_fbank.T).unsqueeze(0).to(device)
            else:
                feat = torch.Tensor(
                    np.square(noisy_mag).T).unsqueeze(0).to(device)
                label = torch.Tensor(
                    np.square(early50_mag).T).unsqueeze(0).to(device)
                noise = torch.Tensor(
                    np.square(noise_mag).T).unsqueeze(0).to(device)

            if args.target_type.lower() == "mapping_mag":
                predict = model.forward(feat.sqrt())
            else:
                predict = model.forward(torch.log(feat + opts['eps']))

            results = calc_target(feat, label, noise, predict, opts)
            enhanced = results["enhanced"]
            predict = results["predict"]
            target = results["target"]

            if args.feature_domain == "mel":
                enhanced_pow = 0  # no linear power spectrum in the mel-domain branch; kept as a placeholder
                enhanced_fbank = enhanced[0, :, :].cpu().numpy()
            else:
                enhanced_pow = enhanced[0, :, :].cpu().numpy()
                enhanced_fbank = np.matmul(enhanced_pow, melbank.T)

            # the waveform was divided by c, so scale the power-domain features by c**2 before taking the log
            log_enhanced_fbank = np.log(enhanced_fbank * (c**2.) + opts['eps'])

            if debug_model:
                sio.savemat(
                    "debug/%s/%s_%s" % (_method, wav_id, wav_path.split('/')[-5]),
                    {
                        'noisy_mag': noisy_mag,
                        'noisy_fbank': noisy_fbank,
                        'enhanced_mag': np.sqrt(enhanced_pow).T,
                        'enhanced_fbank': enhanced_fbank.T,
                        'early50_mag': early50_mag,
                        'early50_fbank': early50_fbank,
                        'predict': predict[0, :, :].cpu().numpy().T,
                        'target': target[0, :, :].cpu().numpy().T,
                        'log_enhanced_fbank': log_enhanced_fbank.T,
                        'log_early50_fbank': np.log(early50_fbank * (c**2.) + opts['eps']),
                        'c': c
                    })
                # it >= 0 is always true here, so debug mode stops after the first utterance
                if it >= 0:
                    return
            else:
                kaldi_holder.set_key(wav_id)
                kaldi_holder.set_value(log_enhanced_fbank)
                kaldi_holder.write_to(ark_file)
                scp_file.write("%s %s/feats.ark:%d\n" %
                               (wav_id, udir_path, offset))
                offset += kaldi_holder.get_real_len()

            enhanced_number += 1
            if enhanced_number % 40 == 0:
                print_with_time(
                    "Enhanced %5d(%6.2f%%) utterance" %
                    (enhanced_number, 100. * enhanced_number / len(wav_scp)))
        print_with_time("Enhanced %d utterance" % enhanced_number)
        # the ark/scp files are only opened when not running in debug mode
        if not debug_model:
            ark_file.close()
            scp_file.close()
        post_process(noisy_dir_path, udir_path)
        print_with_time("Done %s." % _method)