def test():
    """Extract audio features with a pretrained AT_net encoder and dump them to disk.

    Loads the checkpoint named by ``config.at_model``, runs the first audio file
    found under ``<config.in_file>/audio/`` through the encoder, and saves one
    ``.pt`` tensor per frame into ``<config.in_file>/feature/``.

    NOTE(review): requires CUDA and a module-level ``config``; assumes the
    ``feature/`` directory already exists — TODO confirm with callers.
    """
    # Pretrained audio encoder (checkpoint saved from DataParallel, hence multi2single).
    encoder = AT_net().cuda()
    weights = multi2single(config.at_model, 1)
    encoder.load_state_dict(weights)
    encoder.eval()

    # Take the first audio file in the input directory, resampled to 16 kHz mono.
    audio_path = glob.glob(os.path.join(config.in_file, 'audio/*'))[0]
    speech, sr = librosa.load(audio_path, sr=16000, mono=True)

    # Pad 1920 samples (0.12 s at 16 kHz) of silence at both ends before MFCC.
    speech = np.insert(speech, 0, np.zeros(1920))
    speech = np.append(speech, np.zeros(1920))
    mfcc = python_speech_features.mfcc(speech, 16000, winstep=0.01)

    with torch.no_grad():
        # One window per video frame: 28 MFCC rows (7 frames x 4 rows), first
        # cepstral coefficient dropped. Window at frame f spans [f-3, f+4).
        last_frame = int(mfcc.shape[0] / 4) - 4
        windows = []
        for frame_idx in range(3, last_frame + 1):
            chunk = mfcc[(frame_idx - 3) * 4:(frame_idx + 4) * 4, 1:]
            windows.append(torch.FloatTensor(chunk).cuda())
        batch = torch.stack(windows, dim=0).unsqueeze(0)

        features = encoder(batch)

        # Truncate to a multiple of 25 (presumably 25 fps video — TODO confirm).
        usable = (len(features) // 25) * 25
        for i in tqdm(range(usable)):
            out_path = os.path.join(config.in_file, 'feature/%05d.pt' % (i + 1))
            torch.save(features[i][0].detach().cpu(), out_path)
        print('feature length:', usable)
def __init__(self, config):
    """Build the landmark-generator trainer: model, losses, optimizer, data loader.

    Args:
        config: parsed options object; fields read here include ``lstm``,
            ``cuda``, ``device_ids``, ``load_model``, ``start_epoch``,
            ``pretrained_dir``, ``pretrained_epoch``, ``lr``, ``beta1``,
            ``beta2``, ``dataset``, ``dataset_dir``, ``is_train``,
            ``batch_size`` and ``num_thread``.

    NOTE(review): order matters below — weights are initialized before any
    checkpoint load so a loaded model is not clobbered, and the optimizer is
    created after loading so it sees the final parameters.
    """
    # LSTM variant consumes a sequence of audio windows; the single variant
    # maps one window to one landmark frame.
    if config.lstm == True:
        self.generator = AT_net()
    else:
        self.generator = AT_single()
    self.l1_loss_fn = nn.L1Loss()
    self.mse_loss_fn = nn.MSELoss()
    self.config = config
    if config.cuda:
        # Multi-GPU path: wrap in DataParallel across the configured devices.
        device_ids = [int(i) for i in config.device_ids.split(',')]
        self.generator = nn.DataParallel(self.generator, device_ids=device_ids).cuda()
        # self.generator = self.generator.cuda()
        self.mse_loss_fn = self.mse_loss_fn.cuda()
        self.l1_loss_fn = self.l1_loss_fn.cuda()
    # #########single GPU#######################
    # if config.cuda:
    #     self.generator = self.generator.cuda()
    #     self.mse_loss_fn = self.mse_loss_fn.cuda()
    #     self.l1_loss_fn = nn.L1Loss().cuda()
    initialize_weights(self.generator)
    self.start_epoch = 0
    if config.load_model:
        # Resume training from a saved checkpoint.
        self.start_epoch = config.start_epoch
        self.load(config.pretrained_dir, config.pretrained_epoch)
    self.opt_g = torch.optim.Adam(self.generator.parameters(),
                                  lr=config.lr,
                                  betas=(config.beta1, config.beta2))
    # Dataset choice: LRW vs. GRID, each with an LSTM (sequence) or
    # single-frame landmark-PCA variant.
    if config.lstm:
        if config.dataset == 'lrw':
            self.dataset = LRW_1D_lstm_landmark_pca(config.dataset_dir, train=config.is_train)
        else:
            self.dataset = GRID_1D_lstm_landmark_pca(config.dataset_dir, train=config.is_train)
    else:
        if config.dataset == 'lrw':
            self.dataset = LRW_1D_single_landmark_pca(config.dataset_dir, train=config.is_train)
        else:
            self.dataset = GRID_1D_single_landmark_pca(config.dataset_dir, train=config.is_train)
    self.data_loader = DataLoader(self.dataset,
                                  batch_size=config.batch_size,
                                  num_workers=config.num_thread,
                                  shuffle=True,
                                  drop_last=True)
def test():
    """Generate a talking-face video: audio -> landmarks (AT_net) -> frames (VG_net).

    Reads the driving audio from ``config.in_file`` and the identity image from
    ``config.person``; writes intermediate frames under ``../temp/{img,motion,attention}``
    and the final video into ``config.sample_dir``.

    Fix: the MFCC matrix was computed once from the *unpadded* speech and then
    immediately recomputed after padding — the first computation was dead work
    and has been removed.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = config.device_ids
    # Fresh scratch directories for generated frames / motion maps / attention maps.
    if os.path.exists('../temp'):
        shutil.rmtree('../temp')
    os.mkdir('../temp')
    os.mkdir('../temp/img')
    os.mkdir('../temp/motion')
    os.mkdir('../temp/attention')

    # PCA basis (first 6 components) and mean landmark for the LRW model.
    pca = torch.FloatTensor(np.load('../basics/U_lrw1.npy')[:, :6]).cuda()
    mean = torch.FloatTensor(np.load('../basics/mean_lrw1.npy')).cuda()

    decoder = VG_net()
    encoder = AT_net()
    if config.cuda:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    # Checkpoints were saved from DataParallel models; strip the module prefix.
    state_dict2 = multi2single(config.vg_model, 1)
    decoder.load_state_dict(state_dict2)
    state_dict = multi2single(config.at_model, 1)
    encoder.load_state_dict(state_dict)
    encoder.eval()
    decoder.eval()

    test_file = config.in_file

    # Identity image and its detected landmarks.
    example_image, example_landmark = generator_demo_example_lips(config.person)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    example_image = cv2.cvtColor(example_image, cv2.COLOR_BGR2RGB)
    example_image = transform(example_image)
    example_landmark = example_landmark.reshape(
        (1, example_landmark.shape[0] * example_landmark.shape[1]))
    if config.cuda:
        example_image = Variable(example_image.view(1, 3, 128, 128)).cuda()
        example_landmark = Variable(
            torch.FloatTensor(example_landmark.astype(float))).cuda()
    else:
        example_image = Variable(example_image.view(1, 3, 128, 128))
        example_landmark = Variable(
            torch.FloatTensor(example_landmark.astype(float)))

    # Project the example landmarks into PCA space (scale, center, project).
    example_landmark = example_landmark * 5.0
    example_landmark = example_landmark - mean.expand_as(example_landmark)
    example_landmark = torch.mm(example_landmark, pca)

    # Load speech and extract MFCC features from the silence-padded signal.
    # (Previously the MFCC was also computed before padding — dead code, removed.)
    speech, sr = librosa.load(test_file, sr=16000)
    speech = np.insert(speech, 0, np.zeros(1920))
    speech = np.append(speech, np.zeros(1920))
    mfcc = python_speech_features.mfcc(speech, 16000, winstep=0.01)
    # High-rate copy of the audio for muxing into the output video.
    sound, _ = librosa.load(test_file, sr=44100)

    print('=======================================')
    print('Start to generate images')
    t = time.time()
    ind = 3
    with torch.no_grad():
        fake_lmark = []
        input_mfcc = []
        # One 28-row MFCC window (coefficients 1..12) per output video frame.
        while ind <= int(mfcc.shape[0] / 4) - 4:
            t_mfcc = mfcc[(ind - 3) * 4:(ind + 4) * 4, 1:]
            t_mfcc = torch.FloatTensor(t_mfcc).cuda()
            input_mfcc.append(t_mfcc)
            ind += 1
        input_mfcc = torch.stack(input_mfcc, dim=0)
        input_mfcc = input_mfcc.unsqueeze(0)

        fake_lmark = encoder(example_landmark, input_mfcc)
        fake_lmark = fake_lmark.view(fake_lmark.size(0) * fake_lmark.size(1), 6)

        # Back-project the example landmarks from PCA space for the decoder.
        example_landmark = torch.mm(example_landmark, pca.t())
        example_landmark = example_landmark + mean.expand_as(example_landmark)

        # Exaggerate the higher PCA components of the predicted landmarks.
        fake_lmark[:, 1:6] *= 2 * torch.FloatTensor(
            np.array([1.1, 1.2, 1.3, 1.4, 1.5])).cuda()
        fake_lmark = torch.mm(fake_lmark, pca.t())
        fake_lmark = fake_lmark + mean.expand_as(fake_lmark)
        fake_lmark = fake_lmark.unsqueeze(0)

        fake_ims, atts, ms, _ = decoder(example_image, fake_lmark, example_landmark)

        # Dump every generated frame plus its motion and attention maps.
        for indx in range(fake_ims.size(1)):
            fake_im = fake_ims[:, indx]
            fake_store = fake_im.permute(0, 2, 3, 1).data.cpu().numpy()[0]
            scipy.misc.imsave(
                "{}/{:05d}.png".format(os.path.join('../', 'temp', 'img'), indx),
                fake_store)
            m = ms[:, indx]
            att = atts[:, indx]
            m = m.permute(0, 2, 3, 1).data.cpu().numpy()[0]
            att = att.data.cpu().numpy()[0, 0]
            scipy.misc.imsave(
                "{}/{:05d}.png".format(os.path.join('../', 'temp', 'motion'), indx), m)
            scipy.misc.imsave(
                "{}/{:05d}.png".format(os.path.join('../', 'temp', 'attention'), indx), att)

        print('In total, generate {:d} images, cost time: {:03f} seconds'.format(
            fake_ims.size(1), time.time() - t))

        fake_lmark = fake_lmark.data.cpu().numpy()
        np.save(os.path.join(config.sample_dir, 'obama_fake.npy'), fake_lmark)
        fake_lmark = np.reshape(fake_lmark, (fake_lmark.shape[1], 68, 2))
        utils.write_video_wpts_wsound(fake_lmark, sound, 44100, config.sample_dir,
                                      'fake', [-1.0, 1.0], [-1.0, 1.0])

        video_name = os.path.join(config.sample_dir, 'results.mp4')
        utils.image_to_video(os.path.join('../', 'temp', 'img'), video_name)
        utils.add_audio(video_name, config.in_file)
        # NOTE(review): add_audio presumably writes a .mov next to the .mp4
        # (same pattern elsewhere in this file) — confirm against utils.add_audio.
        print('The generated video is: {}'.format(
            os.path.join(config.sample_dir, 'results.mov')))
def test():
    """TTS-driven pipeline: news text/URL -> speech -> landmarks -> landmark video.

    Synthesizes speech for ``config.text_tts`` (or scraped ``config.news_url``
    content), predicts landmark sequences with AT_net, renders them with
    ``mark_paint`` and muxes audio via ffmpeg.

    Returns:
        Path of the saved ``.npy`` landmark file.

    Fixes: removed the unreachable ``return False`` after the main ``return``;
    removed the dead MFCC computation performed before silence padding.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = config.device_ids
    result_dir = 'temp/' + config.in_file
    motion_dir = result_dir + '/motion/'
    os.mkdir(result_dir)
    os.mkdir(motion_dir)

    # CPU tensors throughout this pipeline (no .cuda() calls below).
    pca = torch.FloatTensor(np.load('basics/pca.npy')[:, :6])
    mean = torch.FloatTensor(np.load('basics/mean.npy'))

    decoder = VG_net()
    encoder = AT_net()
    state_dict2 = multi2single(config.vg_model, 1)
    decoder.load_state_dict(state_dict2)
    state_dict = multi2single(config.at_model, 1)
    encoder.load_state_dict(state_dict)
    encoder.eval()
    decoder.eval()

    test_file = result_dir + "/" + config.in_file + ".wav"
    test_file_old = result_dir + "/old_" + config.in_file + ".wav"

    # Prefer explicit text; fall back to scraping the news URL.
    if config.text_tts == "" and config.news_url != "":
        parse_news_content = get_info(config.news_url)['news_content']
    else:
        parse_news_content = config.text_tts

    tts = TTS(config.name_tts, "wav", "000000-0000-0000-0000-00000000",
              config.lang_tts, emotion="neutral", speed=1)
    # TTS backend caps input length; truncate to 1999 characters.
    tts.generate(parse_news_content[:1999])
    if config.shift == 1:
        # Optional pitch/tempo shift of the synthesized audio.
        tts.save(test_file_old)
        audio_shift(test_file_old, test_file)
    else:
        tts.save(test_file)

    example_image, example_landmark = generator_demo_example_lips(config.person)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    example_image = cv2.cvtColor(example_image, cv2.COLOR_BGR2RGB)
    example_image = transform(example_image)
    example_landmark = example_landmark.reshape(
        (1, example_landmark.shape[0] * example_landmark.shape[1]))
    if config.cuda == True:
        example_image = Variable(example_image.view(1, 3, 128, 128)).cuda()
        example_landmark = Variable(
            torch.FloatTensor(example_landmark.astype(float))).cuda()
    else:
        example_image = Variable(example_image.view(1, 3, 128, 128))
        example_landmark = Variable(
            torch.FloatTensor(example_landmark.astype(float)))

    # Project example landmarks into PCA space.
    example_landmark = example_landmark * 5.0
    example_landmark = example_landmark - mean.expand_as(example_landmark)
    example_landmark = torch.mm(example_landmark, pca)

    # MFCC from the silence-padded speech only (pre-padding compute was dead code).
    speech, sr = librosa.load(test_file, sr=16000)
    speech = np.insert(speech, 0, np.zeros(1920))
    speech = np.append(speech, np.zeros(1920))
    mfcc = python_speech_features.mfcc(speech, 16000, winstep=0.01)
    sound, _ = librosa.load(test_file, sr=44100)

    print('=======================================')
    print('Generate images')
    t = time.time()
    ind = 3
    with torch.no_grad():
        fake_lmark = []
        input_mfcc = []
        while ind <= int(mfcc.shape[0] / 4) - 4:
            t_mfcc = mfcc[(ind - 3) * 4:(ind + 4) * 4, 1:]
            t_mfcc = torch.FloatTensor(t_mfcc)
            input_mfcc.append(t_mfcc)
            ind += 1
        input_mfcc = torch.stack(input_mfcc, dim=0)
        input_mfcc = input_mfcc.unsqueeze(0)

        fake_lmark = encoder(example_landmark, input_mfcc)
        fake_lmark = fake_lmark.view(fake_lmark.size(0) * fake_lmark.size(1), 6)

        example_landmark = torch.mm(example_landmark, pca.t())
        example_landmark = example_landmark + mean.expand_as(example_landmark)

        # Exaggerate the higher PCA components of the predicted landmarks.
        fake_lmark[:, 1:6] *= 2 * torch.FloatTensor(
            np.array([1.1, 1.2, 1.3, 1.4, 1.5]))
        fake_lmark = torch.mm(fake_lmark, pca.t())
        fake_lmark = fake_lmark + mean.expand_as(fake_lmark)
        fake_lmark = fake_lmark.unsqueeze(0)

        fake_lmark = fake_lmark.data.cpu().numpy()
        file_mark = result_dir + "/" + config.in_file + ".npy"
        file_mp4 = result_dir + "/" + config.in_file  # extension appended below
        np.save(file_mark, fake_lmark)

        # Render landmark frames, then encode and mux audio with ffmpeg.
        mark_paint.mark_video(fake_lmark, motion_dir)
        cmd = ('ffmpeg -framerate 25 -i ' + motion_dir +
               '%d.png -filter:v scale=512:-1 -c:v libx264 -pix_fmt yuv420p ' +
               file_mp4 + '.mp4')
        subprocess.call(cmd, shell=True)
        print('video done')
        cmd = ('ffmpeg -i ' + file_mp4 + '.mp4 -i ' + test_file +
               ' -c:v copy -c:a aac -strict experimental ' +
               file_mp4 + '_result.mp4')
        subprocess.call(cmd, shell=True)
        print('video+audio done')
        return file_mark
def test():
    """Batch-generate videos for voxceleb2 clips listed in a CSV manifest.

    Each CSV row supplies (video_path, audio_path, start_frame, end_frame);
    only MFCC windows inside [start, end) drive the generator. Output videos
    are written to ``config.sample_dir``.

    Fixes: bare ``except:`` narrowed to ``except Exception`` (was swallowing
    KeyboardInterrupt/SystemExit) and now logs the failure; removed the dead
    pre-padding MFCC computation; final path message no longer prefixes
    ``config.sample_dir`` twice.
    """
    data_root = '/home/cxu-serve/p1/common/voxceleb2/unzip/test_video/'
    audios = []
    videos = []
    start_ids = []
    end_ids = []
    # Manifest columns: 0=video path, 1=audio path, 2=start frame, 3=end frame.
    with open(
            '/home/cxu-serve/p1/common/degree/degree_store/vox/new_extra_data.csv',
            'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            audios.append(row[1])
            videos.append(row[0])
            start_ids.append(int(row[2]))
            end_ids.append(int(row[3]))

    os.environ["CUDA_VISIBLE_DEVICES"] = config.device_ids
    if os.path.exists('../temp'):
        shutil.rmtree('../temp')
    os.mkdir('../temp')

    pca = torch.FloatTensor(np.load('../basics/U_lrw1.npy')[:, :6]).cuda()
    mean = torch.FloatTensor(np.load('../basics/mean_lrw1.npy')).cuda()
    decoder = VG_net()
    encoder = AT_net()
    if config.cuda:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    state_dict2 = multi2single(config.vg_model, 1)
    decoder.load_state_dict(state_dict2)
    state_dict = multi2single(config.at_model, 1)
    encoder.load_state_dict(state_dict)
    encoder.eval()
    decoder.eval()

    for i in range(len(audios)):
        try:
            audio_file = audios[i]
            video_file = videos[i]
            test_file = audio_file
            image_path = video_file
            video_name = video_file.split('/')[-1][:-4] + '_' + str(start_ids[i])

            example_image, example_landmark = generator_demo_example_lips(image_path)
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
            ])
            example_image = cv2.cvtColor(example_image, cv2.COLOR_BGR2RGB)
            example_image = transform(example_image)
            example_landmark = example_landmark.reshape(
                (1, example_landmark.shape[0] * example_landmark.shape[1]))
            if config.cuda:
                example_image = Variable(example_image.view(1, 3, 128, 128)).cuda()
                example_landmark = Variable(
                    torch.FloatTensor(example_landmark.astype(float))).cuda()
            else:
                example_image = Variable(example_image.view(1, 3, 128, 128))
                example_landmark = Variable(
                    torch.FloatTensor(example_landmark.astype(float)))

            # Project example landmarks into PCA space.
            example_landmark = example_landmark * 5.0
            example_landmark = example_landmark - mean.expand_as(example_landmark)
            example_landmark = torch.mm(example_landmark, pca)

            # MFCC from silence-padded speech (pre-padding compute was dead code).
            speech, sr = librosa.load(audio_file, sr=16000)
            speech = np.insert(speech, 0, np.zeros(1920))
            speech = np.append(speech, np.zeros(1920))
            mfcc = python_speech_features.mfcc(speech, 16000, winstep=0.01)

            print('=======================================')
            print('Start to generate images')
            t = time.time()
            ind = 3
            with torch.no_grad():
                fake_lmark = []
                input_mfcc = []
                while ind <= int(mfcc.shape[0] / 4) - 4:
                    t_mfcc = mfcc[(ind - 3) * 4:(ind + 4) * 4, 1:]
                    t_mfcc = torch.FloatTensor(t_mfcc).cuda()
                    # Keep only the windows inside the requested frame range.
                    if ind >= start_ids[i] and ind < end_ids[i]:
                        input_mfcc.append(t_mfcc)
                    ind += 1
                input_mfcc = torch.stack(input_mfcc, dim=0)
                input_mfcc = input_mfcc.unsqueeze(0)
                print(input_mfcc.shape)

                fake_lmark = encoder(example_landmark, input_mfcc)
                fake_lmark = fake_lmark.view(
                    fake_lmark.size(0) * fake_lmark.size(1), 6)

                example_landmark = torch.mm(example_landmark, pca.t())
                example_landmark = example_landmark + mean.expand_as(example_landmark)

                # Exaggerate the higher PCA components of the prediction.
                fake_lmark[:, 1:6] *= 2 * torch.FloatTensor(
                    np.array([1.1, 1.2, 1.3, 1.4, 1.5])).cuda()
                fake_lmark = torch.mm(fake_lmark, pca.t())
                fake_lmark = fake_lmark + mean.expand_as(fake_lmark)
                fake_lmark = fake_lmark.unsqueeze(0)

                fake_ims, _, _, _ = decoder(example_image, fake_lmark,
                                            example_landmark)

                # Clear scratch frames from the previous clip.
                os.system('rm ../temp/*')
                for indx in range(fake_ims.size(1)):
                    fake_im = fake_ims[:, indx]
                    fake_store = fake_im.permute(0, 2, 3, 1).data.cpu().numpy()[0]
                    scipy.misc.imsave(
                        "{}/{:05d}.png".format(os.path.join('../', 'temp'), indx),
                        fake_store)
                print(time.time() - t)

                fake_lmark = fake_lmark.data.cpu().numpy()
                video_name = os.path.join(config.sample_dir, video_name)
                utils.image_to_video(os.path.join('../', 'temp'),
                                     video_name + '.mp4')
                utils.add_audio(video_name + '.mp4', audio_file)
                # video_name already contains sample_dir; don't join it again.
                # NOTE(review): '.mov' assumes add_audio writes a .mov — confirm.
                print('The generated video is: {}'.format(video_name + '.mov'))
        except Exception as e:
            # Best-effort batch: skip clips that fail (bad faces, short audio, ...),
            # but say why instead of failing silently.
            print('skip clip {}: {}'.format(i, e))
            continue
def test():
    """Evaluate the landmark generator on a few batches and plot real vs. fake landmarks.

    Loads ``config.model_name``, draws up to 5 batches from the chosen dataset
    (LRW or GRID, LSTM or single-frame), back-projects predictions from PCA
    space, and saves landmark plots under ``config.sample_dir``.

    Fix: ``data_iter.next()`` is Python-2-only and raises AttributeError on
    Python 3 — replaced with the builtin ``next(data_iter)``.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = config.device_ids
    config.is_train = 'test'
    if config.lstm == True:
        generator = AT_net()
    else:
        generator = AT_single()

    # PCA basis / mean matching the dataset the model was trained on.
    if config.dataset == 'grid':
        pca = torch.FloatTensor(np.load('../basics/U_grid.npy')[:, :6]).cuda()
        mean = torch.FloatTensor(np.load('../basics/mean_grid.npy')).cuda()
    elif config.dataset == 'lrw':
        pca = torch.FloatTensor(np.load('../basics/U_lrw1.npy')[:, :6]).cuda()
        mean = torch.FloatTensor(np.load('../basics/mean_lrw1.npy')).cuda()
    else:
        raise Exception('wrong key word for the dataset input')

    # Plot axis limits/labels shared by all figures.
    xLim = (-1.0, 1.0)
    yLim = (-1.0, 1.0)
    xLab = 'x'
    yLab = 'y'

    state_dict = multi2single(config.model_name, 1)
    generator.load_state_dict(state_dict)
    print('load pretrained [{}]'.format(config.model_name))

    if config.lstm:
        if config.dataset == 'lrw':
            dataset = LRW_1D_lstm_landmark_pca(config.dataset_dir, train=config.is_train)
        else:
            dataset = GRID_1D_lstm_landmark_pca(config.dataset_dir, train=config.is_train)
    else:
        if config.dataset == 'lrw':
            dataset = LRW_1D_single_landmark_pca(config.dataset_dir, train=config.is_train)
        else:
            dataset = GRID_1D_single_landmark_pca(config.dataset_dir, train=config.is_train)
    data_loader = DataLoader(dataset,
                             batch_size=config.batch_size,
                             num_workers=config.num_thread,
                             shuffle=False,
                             drop_last=True)
    # Skip the first batch (was `data_iter.next()`, which breaks on Python 3).
    data_iter = iter(data_loader)
    next(data_iter)

    if not os.path.exists(config.sample_dir):
        os.mkdir(config.sample_dir)
    if not os.path.exists(os.path.join(config.sample_dir, 'fake')):
        os.mkdir(os.path.join(config.sample_dir, 'fake'))
    if not os.path.exists(os.path.join(config.sample_dir, 'real')):
        os.mkdir(os.path.join(config.sample_dir, 'real'))

    if config.cuda:
        generator = generator.cuda()
    generator.eval()

    for step, (example_landmark, example_audio, lmark, audio) in enumerate(data_loader):
        with torch.no_grad():
            print(step)
            if step == 5:
                break
            if config.cuda:
                example_audio = Variable(example_audio.float()).cuda()
                lmark = Variable(lmark.float()).cuda()
                audio = Variable(audio.float()).cuda()
                example_landmark = Variable(example_landmark.float()).cuda()
            if config.lstm:
                # Sequence model: 16 frames per sample, 6 PCA coefficients each.
                fake_lmark = generator(example_landmark, audio)
                fake_lmark = fake_lmark.view(
                    fake_lmark.size(0) * fake_lmark.size(1), 6)
                fake_lmark[:, 1:6] *= 2 * torch.FloatTensor(
                    np.array([1.2, 1.4, 1.6, 1.8, 2.0])).cuda()
                fake_lmark = torch.mm(fake_lmark, pca.t())
                fake_lmark = fake_lmark + mean.expand_as(fake_lmark)
                fake_lmark = fake_lmark.view(config.batch_size, 16, 136)
                fake_lmark = fake_lmark.data.cpu().numpy()

                lmark = lmark.view(lmark.size(0) * lmark.size(1), 6)
                lmark = torch.mm(lmark, pca.t())
                lmark = lmark + mean.expand_as(lmark)
                lmark = lmark.view(config.batch_size, 16, 136)
                lmark = lmark.data.cpu().numpy()

                # NOTE(review): paths assume config.sample_dir ends with '/' —
                # confirm; plots land beside, not inside, the real/fake dirs.
                for indx in range(config.batch_size):
                    for jj in range(16):
                        name = "{}real_{}_{}_{}.png".format(
                            config.sample_dir, step, indx, jj)
                        utils.plot_flmarks(lmark[indx, jj], name, xLim, yLim,
                                           xLab, yLab, figsize=(10, 10))
                        name = "{}fake_{}_{}_{}.png".format(
                            config.sample_dir, step, indx, jj)
                        utils.plot_flmarks(fake_lmark[indx, jj], name, xLim, yLim,
                                           xLab, yLab, figsize=(10, 10))
            else:
                # Single-frame model.
                fake_lmark = generator(example_landmark, audio)
                fake_lmark[:, 1:6] *= 2 * torch.FloatTensor(
                    np.array([1.2, 1.4, 1.6, 1.8, 2.0])).cuda()
                fake_lmark = torch.mm(fake_lmark, pca.t())
                fake_lmark = fake_lmark + mean.expand_as(fake_lmark)
                fake_lmark = fake_lmark.data.cpu().numpy()

                lmark = torch.mm(lmark, pca.t())
                lmark = lmark + mean.expand_as(lmark)
                lmark = lmark.data.cpu().numpy()

                for indx in range(config.batch_size):
                    name = '{}real/real_{}.png'.format(
                        config.sample_dir, step * config.batch_size + indx)
                    utils.plot_flmarks(lmark[indx], name, xLim, yLim,
                                       xLab, yLab, figsize=(10, 10))
                    name = '{}fake/fake_{}.png'.format(
                        config.sample_dir, step * config.batch_size + indx)
                    utils.plot_flmarks(fake_lmark[indx], name, xLim, yLim,
                                       xLab, yLab, figsize=(10, 10))