# Shared imports for all of the variants below. The module paths assume
# the voicefilter repo layout (utils/, model/); adjust if yours differs.
import os

import librosa
import torch
import wavio

from utils.audio import Audio
from utils.hparams import HParam
from model.model import VoiceFilter
from model.embedder import SpeechEmbedder


def main(args, hp):
    # Build the separation network and restore the trained weights.
    model = VoiceFilter(hp).cuda()
    chkpt_model = torch.load(args.checkpoint_path)['model']
    model.load_state_dict(chkpt_model)
    model.eval()

    # Build the speaker encoder that produces the d-vector.
    embedder = SpeechEmbedder(hp).cuda()
    chkpt_embed = torch.load(args.embedder_path)
    embedder.load_state_dict(chkpt_embed)
    embedder.eval()

    audio = Audio(hp)

    # d-vector: embed a mel spectrogram of the reference utterance.
    dvec_wav, _ = librosa.load(args.reference_file, sr=16000)
    dvec_mel = audio.get_mel(dvec_wav)
    dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
    dvec = embedder(dvec_mel)
    dvec = dvec.unsqueeze(0)

    # Mixture: magnitude/phase spectrogram, batched for the model.
    mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
    mag, phase = audio.wav2spec(mixed_wav)
    mag = torch.from_numpy(mag).float().cuda()
    mag = mag.unsqueeze(0)

    # Predict a soft mask conditioned on the d-vector and apply it.
    mask = model(mag, dvec)
    est_mag = mag * mask

    # Back to a waveform, reusing the mixture's phase.
    est_mag = est_mag[0].cpu().detach().numpy()
    est_wav = audio.spec2wav(est_mag, phase)

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, 'result.wav')
    # NOTE: librosa.output.write_wav exists only in librosa < 0.8.
    librosa.output.write_wav(out_path, est_wav, sr=16000)
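# A minimal CLI wrapper for main(args, hp), shown as a sketch. The flag
# names and the HParam(config) loader are inferred from how args and hp
# are used in these snippets, not a confirmed interface.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True,
                        help='yaml hyperparameter file, e.g. config/config.yaml')
    parser.add_argument('--checkpoint_path', required=True,
                        help='VoiceFilter checkpoint (.pt) holding a "model" entry')
    parser.add_argument('--embedder_path', required=True,
                        help='SpeechEmbedder checkpoint (.pt)')
    parser.add_argument('-m', '--mixed_file', required=True,
                        help='mixture wav to separate')
    parser.add_argument('-r', '--reference_file', required=True,
                        help='clean reference wav of the target speaker')
    parser.add_argument('-o', '--out_dir', required=True,
                        help='output directory for result.wav')
    cli_args = parser.parse_args()
    main(cli_args, HParam(cli_args.config))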
def main(args, hp):
    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args.checkpoint_path)['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args.embedder_path)
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)

        # d-vector of the reference speaker.
        ref_wav, _ = librosa.load(args.reference_file, sr=16000)
        ref_mel = audio.get_mel(ref_wav)
        ref_mel = torch.from_numpy(ref_mel).float().cuda()
        dvec = embedder(ref_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args.mixed_file, sr=16000)
        mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
        mixed_mag = torch.from_numpy(mixed_mag).float().cuda()
        mixed_mag = mixed_mag.unsqueeze(0)

        # Here the network output is used as an additive "shadow"
        # spectrogram rather than a multiplicative mask.
        shadow_mag = model(mixed_mag, dvec)

        # Add the shadow while both operands are still tensors, then
        # normalize and move to numpy for reconstruction.
        recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
        recorded_mag = recorded_mag[0].cpu().detach().numpy()

        # Reconstruct with the mixture's phase.
        recorded_wav = audio.spec2wav(recorded_mag, mixed_phase)

        os.makedirs(args.out_dir, exist_ok=True)
        out_path = os.path.join(args.out_dir, 'result.wav')
        librosa.output.write_wav(out_path, recorded_wav, sr=16000)
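# tensor_normalize is not defined anywhere in these snippets. A minimal
# sketch, assuming it only rescales the summed magnitudes back into the
# [0, 1] range that audio.spec2wav expects; the real helper may differ.
def tensor_normalize(mag):
    # Min-max rescaling over the whole tensor; an assumption about the
    # missing helper, not its confirmed definition.
    mag_min, mag_max = mag.min(), mag.max()
    return (mag - mag_min) / (mag_max - mag_min + 1e-8)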
def main(args):
    # The incoming args are ignored; paths are hard-coded for this run.
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }
    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)

        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()
        mag = mag.unsqueeze(0)

        mask = model(mag, dvec)
        est_mag = mag * mask
        est_mag = est_mag[0].cpu().detach().numpy()

        # Return the waveform instead of writing it to disk:
        # est_wav = audio.spec2wav(est_mag, phase)
        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
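# Example use of the returning variant above. librosa.output.write_wav
# was removed in librosa 0.8, so newer environments need a replacement
# such as soundfile for saving the returned waveform:
import soundfile as sf

est_wav = main(None)  # the argument is ignored; paths are hard-coded above
os.makedirs('output', exist_ok=True)
sf.write('output/result.wav', est_wav, 16000)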
def main(args, hp):
    # Load both networks once, on CPU, before iterating over the job list.
    with torch.no_grad():
        model = VoiceFilter(hp)
        chkpt_model = torch.load(args.checkpoint_path, map_location='cpu')['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp)
        chkpt_embed = torch.load(args.embedder_path, map_location='cpu')
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)

        # Each line of out1.txt is tab-separated:
        # mixed-wav path, reference-wav path, output filename.
        with open('out1.txt') as f:
            for line in f:
                res = line.strip().split('\t')

                dvec_wav, _ = librosa.load(res[1], sr=16000)
                dvec_mel = audio.get_mel(dvec_wav)
                dvec_mel = torch.from_numpy(dvec_mel).float()
                dvec = embedder(dvec_mel)
                dvec = dvec.unsqueeze(0)

                mixed_wav, _ = librosa.load(res[0], sr=16000)
                mag, phase = audio.wav2spec(mixed_wav)
                mag = torch.from_numpy(mag).float()
                mag = mag.unsqueeze(0)

                mask = model(mag, dvec)
                est_mag = mag * mask
                est_mag = est_mag[0].cpu().detach().numpy()
                est_wav = audio.spec2wav(est_mag, phase)

                os.makedirs('/root/voicefilter/res', exist_ok=True)
                out_path = os.path.join('/root/voicefilter/res', res[2])
                librosa.output.write_wav(out_path, est_wav, sr=16000)
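# A hypothetical out1.txt matching the parsing above: one job per line,
# tab-separated as <mixed wav>\t<reference wav>\t<output filename>.
# The paths below are illustrative only.
jobs = [
    ('/data/test/000001-mixed.wav', '/data/test/000001-target.wav', '000001-result.wav'),
    ('/data/test/000002-mixed.wav', '/data/test/000002-target.wav', '000002-result.wav'),
]
with open('out1.txt', 'w') as f:
    for mixed, reference, out_name in jobs:
        f.write(f'{mixed}\t{reference}\t{out_name}\n')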
mixed_wav, _ = librosa.load(mixed_wav_path, sr=16000)
mixed_mag, mixed_phase = audio.wav2spec(mixed_wav)
mixed_mag = torch.from_numpy(mixed_mag).float().cuda()
mixed_mag = mixed_mag.unsqueeze(0)

shadow_mag = model(mixed_mag, dvec)  # shadow_mag.size() = [1, 301, 601]

# Add the shadow spectrogram to the mixture and normalize while both
# operands are still tensors, then move everything to numpy.
recorded_mag = tensor_normalize(mixed_mag + shadow_mag)
recorded_mag = recorded_mag[0].cpu().detach().numpy()
mixed_mag = mixed_mag[0].cpu().detach().numpy()
shadow_mag = shadow_mag[0].cpu().detach().numpy()

# spec2wav rescales the magnitudes from the frequency domain back to
# the time domain, which also serves to normalize the wav signal.
shadow_wav = audio.spec2wav(shadow_mag, mixed_phase)
recorded_wav1 = audio.spec2wav(recorded_mag, mixed_phase)  # path 1

# Derive output names next to the input; e.g. for
# mixed_wav_path = '/data/our_dataset/test/13/babble/000001-mixed.wav'
# the [:-9] slice replaces the 'mixed.wav' suffix.
hide1 = mixed_wav_path[:-9] + 'hide1.wav'
hide2 = mixed_wav_path[:-9] + 'hide2.wav'
# purified3 = os.path.join(args.out_dir, 'result3.wav')

# The original mixed wav and the expected focused wav are not 16-bit
# PCM and cannot be read by Google Cloud, so write PCM with wavio.
wavio.write(hide1, recorded_wav1, 16000, sampwidth=2)  # mixture + shadow (frequency domain)
wavio.write(hide2, shadow_wav, 16000, sampwidth=2)     # estimated noise alone
# wavio.write(purified3, enhanced_wav, 16000, sampwidth=2)  # mix + est noise
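# A quick sanity check, assuming wavio is available: read the written
# files back and confirm the sample rate and 16-bit sample width.
# wavio.read returns a Wav object with .data, .rate and .sampwidth.
for path in (hide1, hide2):
    w = wavio.read(path)
    assert w.rate == 16000 and w.sampwidth == 2, path
    print(path, w.data.shape, w.data.dtype)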