def animate(args):
    # get context
    ctx = get_extension_context(args.context)
    nn.set_default_context(ctx)
    logger.setLevel(logging.ERROR)  # to suppress minor messages

    if not args.config:
        assert not args.params, "pretrained weights file is given, but corresponding config file is not. Please give both."
        download_provided_file(
            "https://nnabla.org/pretrained-models/nnabla-examples/GANs/first-order-model/voxceleb_trained_info.yaml")
        args.config = 'voxceleb_trained_info.yaml'

        download_provided_file(
            "https://nnabla.org/pretrained-models/nnabla-examples/GANs/first-order-model/pretrained_fomm_params.h5")

    config = read_yaml(args.config)
    dataset_params = config.dataset_params
    model_params = config.model_params

    if args.detailed:
        vis_params = config.visualizer_params
        visualizer = Visualizer(**vis_params)

    if not args.params:
        assert "log_dir" in config, "no log_dir found in config. therefore failed to locate pretrained parameters."
        param_file = os.path.join(config.log_dir, config.saved_parameters)
    else:
        param_file = args.params

    print(f"Loading {param_file} for image animation...")
    nn.load_parameters(param_file)

    bs, h, w, c = [1] + dataset_params.frame_shape
    source = nn.Variable((bs, c, h, w))
    driving_initial = nn.Variable((bs, c, h, w))
    driving = nn.Variable((bs, c, h, w))

    filename = args.driving
    # read the driving video once; all of its frames are used
    driving_video = read_video(
        filename, dataset_params.frame_shape)  # (#frames, h, w, 3)
    driving_video = np.transpose(
        driving_video, (0, 3, 1, 2))  # (#frames, 3, h, w)

    source_img = imread(args.source, channel_first=True,
                        size=(256, 256)) / 255.
    source_img = source_img[:3]

    source.d = np.expand_dims(source_img, 0)
    driving_initial.d = driving_video[0][:3, ]

    with nn.parameter_scope("kp_detector"):
        kp_source = detect_keypoint(source,
                                    **model_params.kp_detector_params,
                                    **model_params.common_params,
                                    test=True, comm=False)
        persistent_all(kp_source)

    with nn.parameter_scope("kp_detector"):
        kp_driving_initial = detect_keypoint(driving_initial,
                                             **model_params.kp_detector_params,
                                             **model_params.common_params,
                                             test=True, comm=False)
        persistent_all(kp_driving_initial)

    with nn.parameter_scope("kp_detector"):
        kp_driving = detect_keypoint(driving,
                                     **model_params.kp_detector_params,
                                     **model_params.common_params,
                                     test=True, comm=False)
        persistent_all(kp_driving)

    if args.adapt_movement_scale:
        nn.forward_all([kp_source["value"],
                        kp_source["jacobian"],
                        kp_driving_initial["value"],
                        kp_driving_initial["jacobian"]])
        source_area = ConvexHull(kp_source['value'].d[0]).volume
        driving_area = ConvexHull(kp_driving_initial['value'].d[0]).volume
        adapt_movement_scale = np.sqrt(source_area) / np.sqrt(driving_area)
    else:
        adapt_movement_scale = 1

    kp_norm = adjust_kp(kp_source=unlink_all(kp_source),
                        kp_driving=kp_driving,
                        kp_driving_initial=unlink_all(kp_driving_initial),
                        adapt_movement_scale=adapt_movement_scale,
                        use_relative_movement=args.unuse_relative_movement,
                        use_relative_jacobian=args.unuse_relative_jacobian)
    persistent_all(kp_norm)

    with nn.parameter_scope("generator"):
        generated = occlusion_aware_generator(source,
                                              kp_source=unlink_all(kp_source),
                                              kp_driving=kp_norm,
                                              **model_params.generator_params,
                                              **model_params.common_params,
                                              test=True, comm=False)

    if not args.full and 'sparse_deformed' in generated:
        del generated['sparse_deformed']  # remove needless info
    persistent_all(generated)

    generated['kp_driving'] = kp_driving
    generated['kp_source'] = kp_source
    generated['kp_norm'] = kp_norm

    # generated contains these values;
    # 'mask': <Variable((bs, num_kp+1, h/4, w/4)) when scale_factor=0.25
    # 'sparse_deformed': <Variable((bs, num_kp+1, num_channel, h/4, w/4))  # (bs, num_kp + 1, c, h, w)
    # 'occlusion_map': <Variable((bs, 1, h/4, w/4))
    # 'deformed': <Variable((bs, c, h, w))
    # 'prediction': <Variable((bs, c, h, w))

    mode = "arbitrary"
    if "log_dir" in config:
        result_dir = os.path.join(
            args.out_dir, os.path.basename(config.log_dir), f"{mode}")
    else:
        result_dir = os.path.join(args.out_dir, "test_result", f"{mode}")

    # create an empty directory to save generated results
    _ = nm.Monitor(result_dir)

    # load the header image.
    header = imread("imgs/header_combined.png", channel_first=True)
    generated_images = list()

    # compute these in advance and reuse
    nn.forward_all([kp_source["value"], kp_source["jacobian"]],
                   clear_buffer=True)
    nn.forward_all([kp_driving_initial["value"],
                    kp_driving_initial["jacobian"]],
                   clear_buffer=True)

    num_of_driving_frames = driving_video.shape[0]

    for frame_idx in tqdm(range(num_of_driving_frames)):
        driving.d = driving_video[frame_idx][:3, ]
        nn.forward_all([generated["prediction"], generated["deformed"]],
                       clear_buffer=True)

        if args.detailed:
            # visualize source w/kp, driving w/kp, deformed source,
            # generated w/kp, generated image, occlusion map
            visualization = visualizer.visualize(source=source.d,
                                                 driving=driving.d,
                                                 out=generated)
            if args.full:
                visualization = reshape_result(visualization)  # (H, W, C)
            combined_image = visualization.transpose(2, 0, 1)  # (C, H, W)

        elif args.only_generated:
            combined_image = np.clip(generated["prediction"].d[0], 0.0, 1.0)
            combined_image = (255 * combined_image).astype(np.uint8)  # (C, H, W)

        else:
            # visualize source, driving, and generated image
            driving_fake = np.concatenate([
                np.clip(driving.d[0], 0.0, 1.0),
                np.clip(generated["prediction"].d[0], 0.0, 1.0)
            ], axis=2)
            header_source = np.concatenate([
                np.clip(header / 255., 0.0, 1.0),
                np.clip(source.d[0], 0.0, 1.0)
            ], axis=2)
            combined_image = np.concatenate([header_source, driving_fake],
                                            axis=1)
            combined_image = (255 * combined_image).astype(np.uint8)

        generated_images.append(combined_image)

    # once the video is generated, save it.
    output_filename = f"{os.path.splitext(os.path.basename(filename))[0]}.mp4"
    output_filename = f"{os.path.basename(args.source)}_by_{output_filename}"
    output_filename = output_filename.replace("#", "_")

    if args.output_png:
        monitor_vis = nm.MonitorImage(output_filename, nm.Monitor(result_dir),
                                      interval=1, num_images=1,
                                      normalize_method=lambda x: x)
        for frame_idx, img in enumerate(generated_images):
            monitor_vis.add(frame_idx, img)
    else:
        generated_images = [_.transpose(1, 2, 0) for _ in generated_images]
        # you might need to change ffmpeg_params according to your environment.
        mimsave(f'{os.path.join(result_dir, output_filename)}',
                generated_images,
                fps=args.fps,
                ffmpeg_params=["-pix_fmt", "yuv420p",
                               "-vcodec", "libx264",
                               "-f", "mp4",
                               "-q", "0"])

    return
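# --- Illustrative sketch (not part of the original script) -------------------
# A minimal example of how animate() might be invoked programmatically.
# The attribute names mirror exactly the args.* fields read above; the
# concrete values, file names, and this helper itself are hypothetical
# (the repository presumably provides its own argument parser).
def _example_animate_call():
    from argparse import Namespace
    example_args = Namespace(
        context='cudnn',              # extension context, e.g. 'cpu' or 'cudnn'
        config=None,                  # None -> fall back to the provided config (handled above)
        params=None,                  # None -> fall back to the provided weights (handled above)
        source='imgs/source.png',     # hypothetical still image to animate
        driving='imgs/driving.mp4',   # hypothetical driving video
        out_dir='result',
        detailed=False,
        full=False,
        only_generated=False,
        adapt_movement_scale=True,
        unuse_relative_movement=True,   # forwarded to use_relative_movement above
        unuse_relative_jacobian=True,   # forwarded to use_relative_jacobian above
        output_png=False,
        fps=25)
    animate(example_args)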
Logger.load_cpk(opt.checkpoint, generator=generator,
                kp_detector=kp_detector, use_cpu=False)

vis = Visualizer()

# generator = DataParallelWithCallback(generator)
# kp_detector = DataParallelWithCallback(kp_detector)

generator.eval()
kp_detector.eval()

with torch.no_grad():
    driving_video = VideoToTensor()(read_video(
        opt.driving_video, opt.image_shape + (3,)))['video']
    source_image = VideoToTensor()(read_video(
        opt.source_image, opt.image_shape + (3,)))['video'][:, :1]
    print(source_image.shape)

    driving_video = torch.from_numpy(driving_video).unsqueeze(0)
    source_image = torch.from_numpy(source_image).unsqueeze(0)

    out = transfer_one(generator, kp_detector, source_image, driving_video,
                       config['transfer_params'])

    '''
    # Pickle the out
    f = open('keypoints.pkl', 'wb')
    pickle.dump(out, f)
    f.close()
    '''
def __getitem__(self, idx):
    # return the source video at this index, paired with a target video;
    # when id_sampling is enabled, a random chunk of that identity is picked
    if self.is_train and self.id_sampling:
        name_source = self.source_videos[idx]
        path_source = np.random.choice(
            glob.glob(os.path.join(self.source_dir, name_source + '*.mp4')))
        name_target = self.target_videos[idx % len(self.target_videos)]
        path_target = np.random.choice(
            glob.glob(os.path.join(self.target_dir, name_target + '*.mp4')))
    else:
        name_source = self.source_videos[idx]
        path_source = os.path.join(self.source_dir, name_source)
        name_target = self.target_videos[idx % len(self.target_videos)]
        path_target = os.path.join(self.target_dir, name_target)

    video_src_name = os.path.basename(path_source)
    video_tar_name = os.path.basename(path_target)

    # handle source
    # in this case, path is a directory containing one png per frame
    if self.is_train and os.path.isdir(path_source):
        frames = os.listdir(path_source)
        num_frames = len(frames)
        frame_idx = np.sort(
            np.random.choice(num_frames, replace=True, size=2))
        source_array = [
            img_as_float32(
                io.imread(os.path.join(path_source, frames[idx])))
            for idx in frame_idx
        ]
    else:
        # read the video
        source_array = read_video(path_source, frame_shape=self.frame_shape)
        num_frames = len(source_array)
        # when training, randomly sample two frames (sorted);
        # otherwise keep every frame in order
        frame_idx = np.sort(
            np.random.choice(num_frames, replace=True, size=2)
        ) if self.is_train else range(num_frames)
        source_array = source_array[frame_idx]

    # handle target
    # in this case, path is a directory containing one png per frame
    if self.is_train and os.path.isdir(path_target):
        frames = os.listdir(path_target)
        num_frames = len(frames)
        frame_idx = np.sort(
            np.random.choice(num_frames, replace=True, size=2))
        target_array = [
            img_as_float32(
                io.imread(os.path.join(path_target, frames[idx])))
            for idx in frame_idx
        ]
    else:
        # read the video
        target_array = read_video(path_target, frame_shape=self.frame_shape)
        num_frames = len(target_array)
        # when training, randomly sample two frames (sorted);
        # otherwise keep every frame in order
        frame_idx = np.sort(
            np.random.choice(num_frames, replace=True, size=2)
        ) if self.is_train else range(num_frames)
        target_array = target_array[frame_idx]

    if self.transform is not None:
        source_array = self.transform(source_array)
        target_array = self.transform(target_array)

    out = {}
    # build the output dict
    if self.is_train:
        # only the two sampled frames are used: the first as source,
        # the second as driving
        # note: the channel dimension is moved to the front here
        s_source = np.array(source_array[0], dtype='float32')
        s_driving = np.array(source_array[1], dtype='float32')
        t_source = np.array(target_array[0], dtype='float32')
        t_driving = np.array(target_array[1], dtype='float32')
        out['driving'] = s_driving.transpose((2, 0, 1))
        out['source'] = s_source.transpose((2, 0, 1))
        out['t_driving'] = t_driving.transpose((2, 0, 1))
        out['t_source'] = t_source.transpose((2, 0, 1))
    else:
        video = np.array(source_array, dtype='float32')
        out['video'] = video.transpose((3, 0, 1, 2))
        video = np.array(target_array, dtype='float32')
        out['t_video'] = video.transpose((3, 0, 1, 2))

    out['name'] = video_src_name
    out['t_name'] = video_tar_name

    return out
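# --- Illustrative sketch (not from the original repository) ------------------
# How a batch produced by the __getitem__ above might be consumed, assuming the
# enclosing class is a torch.utils.data.Dataset (as in the rest of this repo).
# Only the output keys and layouts ('source'/'driving'/'t_source'/'t_driving'
# as (C, H, W) float32 in training mode) come from the code above; this helper
# and the DataLoader settings are hypothetical.
def _example_iterate_dataset(dataset):
    from torch.utils.data import DataLoader
    loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
    batch = next(iter(loader))
    # In training mode each of 'source', 'driving', 't_source', 't_driving'
    # has shape (4, 3, H, W); 'name' / 't_name' collate into lists of strings.
    print({k: tuple(v.shape) for k, v in batch.items() if hasattr(v, 'shape')})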
def reconstruct(args):
    # get context
    ctx = get_extension_context(args.context)
    nn.set_default_context(ctx)
    logger.setLevel(logging.ERROR)  # to suppress minor messages

    config = read_yaml(args.config)
    dataset_params = config.dataset_params
    model_params = config.model_params

    if args.detailed:
        vis_params = config.visualizer_params
        visualizer = Visualizer(**vis_params)

    if not args.params:
        assert "log_dir" in config, "no log_dir found in config. therefore failed to locate pretrained parameters."
        param_file = os.path.join(config.log_dir, config.saved_parameters)
    else:
        param_file = args.params

    nn.load_parameters(param_file)

    bs, h, w, c = [1] + dataset_params.frame_shape
    source = nn.Variable((bs, c, h, w))
    driving_initial = nn.Variable((bs, c, h, w))
    driving = nn.Variable((bs, c, h, w))

    with nn.parameter_scope("kp_detector"):
        kp_source = detect_keypoint(source,
                                    **model_params.kp_detector_params,
                                    **model_params.common_params,
                                    test=True, comm=False)
        persistent_all(kp_source)

    with nn.parameter_scope("kp_detector"):
        kp_driving = detect_keypoint(driving,
                                     **model_params.kp_detector_params,
                                     **model_params.common_params,
                                     test=True, comm=False)
        persistent_all(kp_driving)

    with nn.parameter_scope("generator"):
        generated = occlusion_aware_generator(source,
                                              kp_source=unlink_all(kp_source),
                                              kp_driving=kp_driving,
                                              **model_params.generator_params,
                                              **model_params.common_params,
                                              test=True, comm=False)

    if not args.full and 'sparse_deformed' in generated:
        del generated['sparse_deformed']  # remove needless info
    persistent_all(generated)

    generated['kp_driving'] = kp_driving
    generated['kp_source'] = kp_source

    # generated contains these values;
    # 'mask': <Variable((bs, num_kp+1, h/4, w/4)) when scale_factor=0.25
    # 'sparse_deformed': <Variable((bs, num_kp+1, num_channel, h/4, w/4))  # (bs, num_kp + 1, c, h, w)
    # 'occlusion_map': <Variable((bs, 1, h/4, w/4))
    # 'deformed': <Variable((bs, c, h, w))
    # 'prediction': <Variable((bs, c, h, w))

    mode = "reconstruction"
    if "log_dir" in config:
        result_dir = os.path.join(
            args.out_dir, os.path.basename(config.log_dir), f"{mode}")
    else:
        result_dir = os.path.join(args.out_dir, "test_result", f"{mode}")

    # create an empty directory to save generated results
    _ = nm.Monitor(result_dir)

    if args.eval:
        os.makedirs(os.path.join(result_dir, "png"), exist_ok=True)

    # load the header image.
    header = imread("imgs/header_combined.png", channel_first=True)

    filenames = sorted(glob.glob(os.path.join(
        dataset_params.root_dir, "test", "*")))
    recon_loss_list = list()

    for filename in tqdm(filenames):
        # this loop repeats until all the test data is used
        driving_video = read_video(
            filename, dataset_params.frame_shape)  # (#frames, h, w, 3)
        driving_video = np.transpose(
            driving_video, (0, 3, 1, 2))  # (#frames, 3, h, w)

        generated_images = list()
        source_img = driving_video[0]

        source.d = np.expand_dims(source_img, 0)
        driving_initial.d = driving_video[0]

        # compute these in advance and reuse
        nn.forward_all([kp_source["value"], kp_source["jacobian"]],
                       clear_buffer=True)

        num_of_driving_frames = driving_video.shape[0]

        for frame_idx in tqdm(range(num_of_driving_frames)):
            driving.d = driving_video[frame_idx]
            nn.forward_all([generated["prediction"], generated["deformed"]],
                           clear_buffer=True)

            if args.detailed:
                # visualize source w/kp, driving w/kp, deformed source,
                # generated w/kp, generated image, occlusion map
                visualization = visualizer.visualize(source=source.d,
                                                     driving=driving.d,
                                                     out=generated)
                if args.full:
                    visualization = reshape_result(visualization)  # (H, W, C)
                combined_image = visualization.transpose(2, 0, 1)  # (C, H, W)

            elif args.only_generated:
                combined_image = np.clip(generated["prediction"].d[0], 0.0, 1.0)
                combined_image = (255 * combined_image).astype(np.uint8)  # (C, H, W)

            else:
                # visualize source, driving, and generated image
                driving_fake = np.concatenate([
                    np.clip(driving.d[0], 0.0, 1.0),
                    np.clip(generated["prediction"].d[0], 0.0, 1.0)
                ], axis=2)
                header_source = np.concatenate([
                    np.clip(header / 255., 0.0, 1.0),
                    np.clip(source.d[0], 0.0, 1.0)
                ], axis=2)
                combined_image = np.concatenate([header_source, driving_fake],
                                                axis=1)
                combined_image = (255 * combined_image).astype(np.uint8)

            generated_images.append(combined_image)

            # compute L1 distance per frame.
            recon_loss_list.append(
                np.mean(np.abs(generated["prediction"].d[0] - driving.d[0])))

        # post process only for reconstruction evaluation.
        if args.eval:
            # crop generated images region only.
            if args.only_generated:
                eval_images = generated_images
            elif args.full:
                eval_images = [_[:, :h, 4*w:5*w] for _ in generated_images]
            elif args.detailed:
                assert generated_images[0].shape == (c, h, 5*w)
                eval_images = [_[:, :, 3*w:4*w] for _ in generated_images]
            else:
                eval_images = [_[:, h:, w:] for _ in generated_images]

            # place them horizontally and save for evaluation.
            image_for_eval = np.concatenate(
                eval_images, axis=2).transpose(1, 2, 0)
            imsave(os.path.join(result_dir, "png",
                                f"{os.path.basename(filename)}.png"),
                   image_for_eval)

        # once each video is generated, save it.
        output_filename = f"{os.path.splitext(os.path.basename(filename))[0]}.mp4"

        if args.output_png:
            monitor_vis = nm.MonitorImage(output_filename,
                                          nm.Monitor(result_dir),
                                          interval=1, num_images=1,
                                          normalize_method=lambda x: x)
            for frame_idx, img in enumerate(generated_images):
                monitor_vis.add(frame_idx, img)
        else:
            generated_images = [_.transpose(1, 2, 0) for _ in generated_images]
            # you might need to change ffmpeg_params according to your environment.
            mimsave(f'{os.path.join(result_dir, output_filename)}',
                    generated_images,
                    fps=args.fps,
                    ffmpeg_params=["-pix_fmt", "yuv420p",
                                   "-vcodec", "libx264",
                                   "-f", "mp4",
                                   "-q", "0"])

    print(f"Reconstruction loss: {np.mean(recon_loss_list)}")
    return
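# --- Illustrative note (not from the original repository) --------------------
# In the default layout built above (neither detailed nor only_generated), each
# combined frame is a (C, 2h, 2w) grid of tiles:
#     [ header  | source     ]
#     [ driving | prediction ]
# so the eval crop `frame[:, h:, w:]` keeps only the generated (prediction)
# tile. A tiny self-check of that indexing on a dummy array; the helper name
# and sizes are hypothetical.
def _example_eval_crop(h=256, w=256, c=3):
    import numpy as np
    frame = np.zeros((c, 2 * h, 2 * w), dtype=np.uint8)
    frame[:, h:, w:] = 255                 # mark the prediction tile
    crop = frame[:, h:, w:]
    assert crop.shape == (c, h, w) and crop.min() == 255
    return crop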
extract the pose points for the first frame of the GIF for each GIF.
This allows for an alignment based on only the first frame.
TODO: Extend this to extract poses from the driving video to then obtain
poses at each frame for alignment.
'''
with torch.no_grad():
    # This dictionary stores the initial pose for each of the GIFs
    poses_dict = {}

    for img_name in tqdm(os.listdir(opt.driving_directory)):
        path_name = opt.driving_directory + img_name
        driving_video = VideoToTensor()(read_video(
            path_name, opt.image_shape + (3,)))['video']
        driving_video = torch.from_numpy(driving_video).unsqueeze(0)

        cat_dict = lambda l, dim: {k: torch.cat([v[k] for v in l], dim=dim)
                                   for k in l[0]}
        d = driving_video.shape[2]
        kp_driving = cat_dict(
            [kp_detector(driving_video[:, :, i:(i + 1)]) for i in range(d)],
            dim=1)
        poses_dict[img_name] = kp_driving

    # Dump the poses dict
    pickle.dump(poses_dict,
                open('./driving_video_poses/{}_poses.pkl'.format(opt.name),
                     'wb'))
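# --- Illustrative sketch (not from the original repository) ------------------
# Loading the pickled poses back for later alignment. The path format string
# matches the dump above; each value is whatever kp_detector returns for a GIF,
# concatenated over frames along dim=1 (e.g. a 'mean' entry per frame) -- the
# exact keys and shapes depend on the detector. The helper name is hypothetical.
def _example_load_poses(name):
    import pickle
    with open('./driving_video_poses/{}_poses.pkl'.format(name), 'rb') as f:
        poses = pickle.load(f)
    for gif_name, kp in poses.items():
        print(gif_name, {k: tuple(v.shape) for k, v in kp.items()})
    return poses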
extract the pose points for the first frame of the GIF for each GIF.
This allows for an alignment based on only the first frame.
TODO: Extend this to extract poses from the driving video to then obtain
poses at each frame for alignment.
'''
with torch.no_grad():
    # This dictionary stores the initial pose for each of the GIFs
    poses_dict = {}

    for img_name in tqdm(os.listdir(opt.source_directory)):
        path_name = opt.source_directory + img_name
        source_image = VideoToTensor()(read_video(
            path_name, opt.image_shape + (3,)))['video'][:, :1]
        print(source_image.shape)
        source_image = torch.from_numpy(source_image).unsqueeze(0)

        # Extract the mean of the keypoints
        mean = kp_detector(source_image)['mean'].data.cpu().numpy()
        # Apply the transformation from normalized [-1, 1] coordinates
        # to pixel coordinates on a 128 x 128 grid
        key_points = 128 * (mean + 1) / 2
        poses_dict[img_name[:-4]] = key_points

        if opt.visualize:
            # img = vis.visualize_initial_pose(source_image, mean)
            img = plt.imread(path_name)