def __convert_single_only_au_AutoVC_format_to_dataset__(self, filename, build_train_dataset=True):
    """
    Convert a single file (audio only, in AutoVC embedding format) to numpy arrays
    :param filename: tuple of (global_clip_index, video_name)
    :param build_train_dataset: if False, copy the raw test wav into place first
    :return: mel-spectrogram S, mean speaker embedding, one-hot quantized F0
    """
    global_clip_index, video_name = filename

    # audio_file = os.path.join(self.src_dir, 'raw_wav', '{}.wav'.format(video_name[:-4]))
    audio_file = os.path.join(
        self.src_dir, 'raw_wav',
        '{:05d}_{}_audio.wav'.format(global_clip_index, video_name[:-4]))

    if not build_train_dataset:
        import shutil
        audio_file = os.path.join(
            self.src_dir, 'raw_wav',
            '{:05d}_{}_audio.wav'.format(global_clip_index, video_name[:-4]))
        shutil.copy(
            os.path.join(self.src_dir, 'test_wav_files', video_name),
            audio_file)

    # normalize loudness to -20 dBFS and overwrite the wav in place
    sound = AudioSegment.from_file(audio_file, "wav")
    normalized_sound = match_target_amplitude(sound, -20.0)
    normalized_sound.export(audio_file, format='wav')

    # mel-spectrogram and normalized F0 contour ('M' selects the pitch-tracking range)
    from src.autovc.retrain_version.vocoder_spec.extract_f0_func import extract_f0_func_audiofile
    S, f0_norm = extract_f0_func_audiofile(audio_file, 'M')

    # quantize the interpolated F0 contour into one-hot bins
    from src.autovc.utils import quantize_f0_interp
    f0_onehot = quantize_f0_interp(f0_norm)

    # mean speaker embedding from Resemblyzer
    from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
    mean_emb, _ = get_spk_emb(audio_file)

    return S, mean_emb, f0_onehot
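# The loudness normalization above relies on a small pydub helper,
# match_target_amplitude, that is not shown in this snippet. A minimal sketch
# of what such a helper typically looks like (an assumption based on the
# standard pydub pattern, not the repo's verified implementation):

from pydub import AudioSegment

def match_target_amplitude(sound, target_dBFS):
    # apply_gain shifts the clip's average loudness (dBFS) to the target level
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)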
                          'examples_cartoon/{}_face_close_mouth.txt'.format(DEMO_CH))

''' STEP 3: Generate audio data as input to audio branch '''
au_data = []
au_emb = []
ains = glob.glob1('examples', '*.wav')
ains = [item for item in ains if item != 'tmp.wav']
ains.sort()
for ain in ains:
    # resample every input wav to 16 kHz in place (a pydub alternative is sketched after this snippet)
    os.system('ffmpeg -y -loglevel error -i examples/{} -ar 16000 examples/tmp.wav'.format(ain))
    shutil.copyfile('examples/tmp.wav', 'examples/{}'.format(ain))

    # speaker (au) embedding via Resemblyzer
    from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
    me, ae = get_spk_emb('examples/{}'.format(ain))
    au_emb.append(me.reshape(-1))

    print('Processing audio file', ain)
    c = AutoVC_mel_Convertor('examples')

    au_data_i = c.convert_single_wav_to_autovc_input(
        audio_filename=os.path.join('examples', ain),
        autovc_model_path=opt_parser.load_AUTOVC_name)
    au_data += au_data_i
    # os.remove(os.path.join('examples', 'tmp.wav'))

if os.path.isfile('examples/tmp.wav'):
    os.remove('examples/tmp.wav')

fl_data = []
rot_tran, rot_quat, anchor_t_shape = [], [], []
for au, info in au_data:
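# Side note on the resampling step in STEP 3 above: the demo shells out to
# ffmpeg to force every input wav to 16 kHz. A rough pydub-based equivalent
# could look like the sketch below (an assumption, not part of the repo;
# pydub may itself call ffmpeg for decoding).

from pydub import AudioSegment

def resample_wav_to_16k(in_path, out_path):
    # load the wav, force a 16 kHz frame rate, and write it back out
    seg = AudioSegment.from_wav(in_path)
    seg = seg.set_frame_rate(16000)
    seg.export(out_path, format='wav')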
def convert_single_wav_to_autovc_input(self, audio_filename, autovc_model_path):

    def pad_seq(x, base=32):
        # pad the time axis up to the next multiple of `base` frames
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load the AutoVC generator checkpoint
    G = Generator(16, 256, 512, 16).eval().to(device)
    g_checkpoint = torch.load(autovc_model_path, map_location=device)
    G.load_state_dict(g_checkpoint['model'])

    # fixed target speaker embedding (obama_emb.txt) used for the conversion
    emb = np.loadtxt('src/autovc/retrain_version/obama_emb.txt')
    emb_trg = torch.from_numpy(emb[np.newaxis, :].astype('float32')).to(device)

    aus = []
    audio_file = audio_filename

    # normalize loudness to -20 dBFS and overwrite the wav in place
    sound = AudioSegment.from_file(audio_file, "wav")
    normalized_sound = match_target_amplitude(sound, -20.0)
    normalized_sound.export(audio_file, format='wav')

    # source mel-spectrogram and normalized F0 contour
    from src.autovc.retrain_version.vocoder_spec.extract_f0_func import extract_f0_func_audiofile
    x_real_src, f0_norm = extract_f0_func_audiofile(audio_file, 'F')

    # quantize the interpolated F0 contour into one-hot bins
    from src.autovc.utils import quantize_f0_interp
    f0_org_src = quantize_f0_interp(f0_norm)

    # source speaker embedding from Resemblyzer
    from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
    emb, _ = get_spk_emb(audio_file)

    ''' normal length version '''
    # x_real, len_pad = pad_seq(x_real_src.astype('float32'))
    # f0_org, _ = pad_seq(f0_org_src.astype('float32'))
    # x_real = torch.from_numpy(x_real[np.newaxis, :].astype('float32')).to(device)
    # emb_org = torch.from_numpy(emb[np.newaxis, :].astype('float32')).to(device)
    # f0_org = torch.from_numpy(f0_org[np.newaxis, :].astype('float32')).to(device)
    # print('source shape:', x_real.shape, emb_org.shape, emb_trg.shape, f0_org.shape)
    #
    # with torch.no_grad():
    #     x_identic, x_identic_psnt, code_real = G(x_real, emb_org, f0_org, emb_trg, f0_org)
    #     print('converted shape:', x_identic_psnt.shape, code_real.shape)

    ''' long split version: process long inputs in chunks of 4096 frames '''
    l = x_real_src.shape[0]
    x_identic_psnt = []
    step = 4096
    for i in range(0, l, step):
        x_real = x_real_src[i:i + step]
        f0_org = f0_org_src[i:i + step]

        x_real, len_pad = pad_seq(x_real.astype('float32'))
        f0_org, _ = pad_seq(f0_org.astype('float32'))
        x_real = torch.from_numpy(x_real[np.newaxis, :].astype('float32')).to(device)
        emb_org = torch.from_numpy(emb[np.newaxis, :].astype('float32')).to(device)
        # emb_trg = torch.from_numpy(emb[np.newaxis, :].astype('float32')).to(device)
        f0_org = torch.from_numpy(f0_org[np.newaxis, :].astype('float32')).to(device)
        print('source shape:', x_real.shape, emb_org.shape, emb_trg.shape, f0_org.shape)

        with torch.no_grad():
            x_identic, x_identic_psnt_i, code_real = G(x_real, emb_org, f0_org, emb_trg, f0_org)
            x_identic_psnt.append(x_identic_psnt_i)

    x_identic_psnt = torch.cat(x_identic_psnt, dim=1)
    print('converted shape:', x_identic_psnt.shape, code_real.shape)

    # trim the padding added to the last chunk
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, :-len_pad, :].cpu().numpy()

    aus.append((uttr_trg, (0, audio_filename, emb)))

    return aus
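# A minimal usage sketch for the converter above (the wav and checkpoint paths
# are placeholders, not taken from the repo). Each returned element mirrors the
# aus.append(...) call in the method: a tuple of the converted mel features and
# (clip_index, source filename, source speaker embedding).

c = AutoVC_mel_Convertor('examples')
aus = c.convert_single_wav_to_autovc_input(
    audio_filename='examples/speech.wav',           # hypothetical input wav
    autovc_model_path='examples/ckpt/autovc.ckpt')  # hypothetical checkpoint path
mel, (clip_idx, src_file, spk_emb) = aus[0]
print(mel.shape)  # time-major mel features for the converted utterance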