def save_videos(tensor: Union[torch.Tensor, List[torch.Tensor]],
                fp: Union[Text, pathlib.Path, BinaryIO],
                format: Optional[str] = None,
                **kwargs) -> None:
    os.makedirs('sample_frame_v2', exist_ok=True)
    batch_size, channel, n_frames, h, w = tensor.shape
    # rsplit keeps any dots in parent directories intact
    f_name_base, fmt = str(fp).rsplit('.', 1)

    # save every frame as a separate image
    for f in range(n_frames):
        frame = tensor[:, :, f]
        f_name = f'{f_name_base}-fr_{f}.{fmt}'
        save_image(frame, f_name, **kwargs)

    # reload the frames and stack them into one array
    merge_list = []
    for f in range(n_frames):
        f_name = f'{f_name_base}-fr_{f}.{fmt}'
        merge_list.append(np.array(Image.open(f_name)))
    merge_list = np.array(merge_list)

    # write the assembled video and gif
    f_name_base = f_name_base.replace('sample_frame_v2', 'sample_video_v2')
    os.makedirs('sample_video_v2', exist_ok=True)
    save_video(merge_list, f_name_base + '.avi', '.', bgr=False, fr_rate=16)
    save_gif(merge_list, f_name_base + '.gif', '.', bgr=False, fr_rate=60)

    # remove the intermediate frame files
    shutil.rmtree('sample_frame_v2')
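# Usage sketch for save_videos (hypothetical example; assumes a 5-D tensor of
# shape (batch, channel, frames, height, width) with values in [0, 1], as
# torchvision's save_image expects, and a path under sample_frame_v2/ so the
# video lands under sample_video_v2/):
#
#   import torch
#   video = torch.rand(1, 3, 16, 64, 64)
#   save_videos(video, 'sample_frame_v2/demo.jpg')
#
# This writes per-frame JPEGs under sample_frame_v2/, assembles
# sample_video_v2/demo.avi and demo.gif, then deletes the frame directory.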
def main(imagedir, gtdir):
    # load net
    net_file = join(realpath(dirname(__file__)), 'SiamRPNBIG.model')
    net = SiamRPNBIG()
    net.load_state_dict(torch.load(net_file))
    net.eval().cuda()

    # warm up (torch.autograd.Variable is deprecated; plain tensors suffice)
    for i in range(10):
        net.temple(torch.zeros(1, 3, 127, 127).cuda())
        net(torch.zeros(1, 3, 255, 255).cuda())

    # start to track: get the first-frame groundtruth
    gt_file = os.path.join(gtdir, 'gt.txt')
    with open(gt_file, 'r') as f:
        lines = f.readlines()
    gt = []
    for line in lines:
        gt.append([int(float(x)) for x in line.strip().split()])

    init_bbox = gt[0]  # top-left x, y, w, h
    target_pos, target_sz = rect_2_cxy_wh(init_bbox)  # top-left x, y, w, h --> center x, y, w, h

    image_list = sorted(glob.glob(os.path.join(imagedir, '*.jpg')))
    im = cv2.imread(image_list[0])  # HxWxC
    state = SiamRPN_init(im, target_pos, target_sz, net)  # init tracker

    bboxes = []
    for i in range(1, len(gt)):
        im = cv2.imread(image_list[i])  # HxWxC
        state = SiamRPN_track(state, im)  # track
        res = cxy_wh_2_rect(state['target_pos'], state['target_sz'])  # center x, y, w, h --> top-left x, y, w, h
        bboxes.append(res.tolist())

    _, precision, precision_auc, iou = _compile_results(gt[1:], bboxes)
    print(' -- Precision (20 px): %.2f -- Precision AUC: %.2f -- IOU: %.2f --'
          % (precision, precision_auc, iou))

    isSavebbox = True
    if isSavebbox:
        print('saving bbox...')
        res_bbox_file = 'results_bbox.json'
        with open(res_bbox_file, 'w') as f:
            json.dump(bboxes, f, indent=2)

    isSavevideo = True
    if isSavevideo:
        print('saving video...')
        save_video(image_list, bboxes)
    print('done')
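# The rect_2_cxy_wh / cxy_wh_2_rect helpers used above come from the tracker's
# utilities; a minimal sketch of the conversion they perform (hypothetical
# re-implementation, shown only to document the box convention):
def _rect_2_cxy_wh_sketch(rect):
    # (top-left x, y, w, h) -> center position array and size array
    x, y, w, h = rect
    return np.array([x + w / 2., y + h / 2.]), np.array([w, h])

def _cxy_wh_2_rect_sketch(pos, sz):
    # center position and size -> (top-left x, y, w, h)
    return np.array([pos[0] - sz[0] / 2., pos[1] - sz[1] / 2., sz[0], sz[1]])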
def output_visuals(vis_rows, batch_data, outputs, args):
    # fetch data and predictions
    mag_mix = batch_data['mag_mix']
    phase_mix = batch_data['phase_mix']
    frames = batch_data['frames']
    infos = batch_data['infos']

    pred_masks_ = outputs['pred_masks']
    gt_masks_ = outputs['gt_masks']
    mag_mix_ = outputs['mag_mix']
    weight_ = outputs['weight']

    # unwarp log scale
    N = args.num_mix
    B = mag_mix.size(0)
    pred_masks_linear = [None for n in range(N)]
    gt_masks_linear = [None for n in range(N)]
    if args.log_freq:
        # the unwarp grid depends only on the batch and mask sizes,
        # so it can be built once outside the loop
        grid_unwarp = torch.from_numpy(
            warpgrid(B, args.stft_frame // 2 + 1,
                     gt_masks_[0].size(3), warp=False)).to(args.device)
    for n in range(N):
        if args.log_freq:
            pred_masks_linear[n] = F.grid_sample(pred_masks_[n], grid_unwarp)
            gt_masks_linear[n] = F.grid_sample(gt_masks_[n], grid_unwarp)
        else:
            pred_masks_linear[n] = pred_masks_[n]
            gt_masks_linear[n] = gt_masks_[n]

    # convert into numpy
    mag_mix = mag_mix.numpy()
    mag_mix_ = mag_mix_.detach().cpu().numpy()
    phase_mix = phase_mix.numpy()
    weight_ = weight_.detach().cpu().numpy()
    for n in range(N):
        pred_masks_[n] = pred_masks_[n].detach().cpu().numpy()
        pred_masks_linear[n] = pred_masks_linear[n].detach().cpu().numpy()
        gt_masks_[n] = gt_masks_[n].detach().cpu().numpy()
        gt_masks_linear[n] = gt_masks_linear[n].detach().cpu().numpy()

        # threshold if binary mask
        if args.binary_mask:
            pred_masks_[n] = (pred_masks_[n] > args.mask_thres).astype(np.float32)
            pred_masks_linear[n] = (pred_masks_linear[n] > args.mask_thres).astype(np.float32)

    # loop over each sample
    for j in range(B):
        row_elements = []

        # video names
        prefix = []
        for n in range(N):
            prefix.append('-'.join(infos[n][0][j].split('/')[-2:]).split('.')[0])
        prefix = '+'.join(prefix)
        makedirs(os.path.join(args.vis, prefix))

        # save mixture
        mix_wav = istft_reconstruction(mag_mix[j, 0], phase_mix[j, 0],
                                       hop_length=args.stft_hop)
        mix_amp = magnitude2heatmap(mag_mix_[j, 0])
        weight = magnitude2heatmap(weight_[j, 0], log=False, scale=100.)
        filename_mixwav = os.path.join(prefix, 'mix.wav')
        filename_mixmag = os.path.join(prefix, 'mix.jpg')
        filename_weight = os.path.join(prefix, 'weight.jpg')
        imsave(os.path.join(args.vis, filename_mixmag), mix_amp[::-1, :, :])
        imsave(os.path.join(args.vis, filename_weight), weight[::-1, :])
        wavfile.write(os.path.join(args.vis, filename_mixwav), args.audRate, mix_wav)
        row_elements += [{'text': prefix},
                         {'image': filename_mixmag, 'audio': filename_mixwav}]

        # save each component
        preds_wav = [None for n in range(N)]
        for n in range(N):
            # GT and predicted audio recovery
            gt_mag = mag_mix[j, 0] * gt_masks_linear[n][j, 0]
            gt_wav = istft_reconstruction(gt_mag, phase_mix[j, 0],
                                          hop_length=args.stft_hop)
            pred_mag = mag_mix[j, 0] * pred_masks_linear[n][j, 0]
            preds_wav[n] = istft_reconstruction(pred_mag, phase_mix[j, 0],
                                                hop_length=args.stft_hop)

            # output masks
            filename_gtmask = os.path.join(prefix, 'gtmask{}.jpg'.format(n + 1))
            filename_predmask = os.path.join(prefix, 'predmask{}.jpg'.format(n + 1))
            gt_mask = (np.clip(gt_masks_[n][j, 0], 0, 1) * 255).astype(np.uint8)
            pred_mask = (np.clip(pred_masks_[n][j, 0], 0, 1) * 255).astype(np.uint8)
            imsave(os.path.join(args.vis, filename_gtmask), gt_mask[::-1, :])
            imsave(os.path.join(args.vis, filename_predmask), pred_mask[::-1, :])

            # output spectrogram (log of magnitude, shown as colormap)
            filename_gtmag = os.path.join(prefix, 'gtamp{}.jpg'.format(n + 1))
            filename_predmag = os.path.join(prefix, 'predamp{}.jpg'.format(n + 1))
            gt_mag = magnitude2heatmap(gt_mag)
            pred_mag = magnitude2heatmap(pred_mag)
            imsave(os.path.join(args.vis, filename_gtmag), gt_mag[::-1, :, :])
            imsave(os.path.join(args.vis, filename_predmag), pred_mag[::-1, :, :])

            # output audio
            filename_gtwav = os.path.join(prefix, 'gt{}.wav'.format(n + 1))
            filename_predwav = os.path.join(prefix, 'pred{}.wav'.format(n + 1))
            wavfile.write(os.path.join(args.vis, filename_gtwav), args.audRate, gt_wav)
            wavfile.write(os.path.join(args.vis, filename_predwav), args.audRate, preds_wav[n])

            # output video
            frames_tensor = [recover_rgb(frames[n][j, :, t])
                             for t in range(args.num_frames)]
            frames_tensor = np.asarray(frames_tensor)
            path_video = os.path.join(args.vis, prefix, 'video{}.mp4'.format(n + 1))
            save_video(path_video, frames_tensor,
                       fps=args.frameRate / args.stride_frames)

            # combine gt video and audio
            filename_av = os.path.join(prefix, 'av{}.mp4'.format(n + 1))
            combine_video_audio(path_video,
                                os.path.join(args.vis, filename_gtwav),
                                os.path.join(args.vis, filename_av))

            row_elements += [{'video': filename_av},
                             {'image': filename_predmag, 'audio': filename_predwav},
                             {'image': filename_gtmag, 'audio': filename_gtwav},
                             {'image': filename_predmask},
                             {'image': filename_gtmask}]

        row_elements += [{'image': filename_weight}]
        vis_rows.append(row_elements)
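# A minimal sketch of the `args` namespace that output_visuals reads; every
# attribute below is actually accessed by the function, but the values shown
# are illustrative assumptions, not the project's defaults:
#
#   from types import SimpleNamespace
#   args = SimpleNamespace(
#       num_mix=2, log_freq=1, stft_frame=1022, stft_hop=256,
#       device=torch.device('cpu'), binary_mask=False, mask_thres=0.5,
#       vis='viz/', audRate=11025, num_frames=3, frameRate=8,
#       stride_frames=1)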
                    default=-1,
                    help="frame height of output video")
args = parser.parse_args()

with open(args.cascade, "r") as f:
    xml = f.read()
stages, features, width, height = utils.parse_cascade(xml)

image = cv2.imread(args.image, 0)
image_height, image_width = image.shape[:2]
new_image_height = int(image_height * args.scale)
new_image_width = int(image_width * args.scale)
image_scaled = cv2.resize(image, (new_image_width, new_image_height),
                          interpolation=cv2.INTER_NEAREST)

t0 = time.time()
marked_images = utils.get_stage_images(image_scaled, stages, features,
                                       height, width, args.k)
t1 = time.time()
print(t1 - t0, "s")

if args.output_width == -1 or args.output_height == -1:
    args.output_width = new_image_width
    args.output_height = new_image_height

utils.save_video(args.output, marked_images, args.output_tps,
                 args.output_width, args.output_height)
print("saved output to", args.output)
def reconstruct_stim(features, net,
                     img_mean=np.array((0, 0, 0)).astype(np.float32),
                     img_std=np.array((1, 1, 1)).astype(np.float32),
                     norm=255,
                     bgr=False,
                     initial_input=None,
                     input_size=(224, 224, 3),
                     feature_masks=None,
                     layer_weight=None, channel=None, mask=None,
                     opt_name='SGD',
                     prehook_dict={},
                     lr_start=0.02, lr_end=1e-12,
                     momentum_start=0.009, momentum_end=0.009,
                     decay_start=0.02, decay_end=1e-11,
                     grad_normalize=True,
                     image_jitter=False, jitter_size=4,
                     image_blur=True, sigma_start=2, sigma_end=0.5,
                     p=3, lamda=0.5, TVlambda=[0, 0],
                     clip_extreme=False, clip_extreme_every=4,
                     e_pct_start=1, e_pct_end=1,
                     clip_small_norm=False, clip_small_norm_every=4,
                     n_pct_start=5., n_pct_end=5.,
                     loss_type='l2',
                     iter_n=200,
                     save_intermediate=False,
                     save_intermediate_every=1,
                     save_intermediate_path=None,
                     disp_every=1,
                     ):
    if loss_type == "l2":
        loss_fun = torch.nn.MSELoss(reduction='sum')
    elif loss_type == "L2_with_reg":
        loss_fun = MSE_with_regulariztion(L_lambda=lamda, alpha=p,
                                          TV_lambda=TVlambda)
    else:
        # the original `assert loss_type + '...'` always passed; raise instead
        raise ValueError(loss_type + ' is not correct')

    # make save dir
    if save_intermediate:
        if save_intermediate_path is None:
            save_intermediate_path = os.path.join(
                '..',
                'recon_img_by_icnn' + datetime.now().strftime('%Y%m%dT%H%M%S'))
        os.makedirs(save_intermediate_path, exist_ok=True)

    # initial input
    if initial_input is None:
        initial_input = np.random.randint(0, 256, (input_size))
    else:
        input_size = initial_input.shape

    # reference image norm, used by the regularization terms
    noise_img = np.random.randint(0, 256, (input_size))
    img_norm0 = np.linalg.norm(noise_img) / 2.

    if save_intermediate:
        if len(input_size) == 3:  # image
            save_name = 'initial_image.jpg'
            if bgr:
                PIL.Image.fromarray(np.uint8(initial_input[..., [2, 1, 0]])).save(
                    os.path.join(save_intermediate_path, save_name))
            else:
                PIL.Image.fromarray(np.uint8(initial_input)).save(
                    os.path.join(save_intermediate_path, save_name))
        elif len(input_size) == 4:  # video
            # with cv2 and ffmpeg installed, save_video can write the
            # preferred video in a video format
            save_name = 'initial_video.avi'
            save_video(initial_input, save_name, save_intermediate_path, bgr)
            save_name = 'initial_video.gif'
            save_gif(initial_input, save_name, save_intermediate_path, bgr,
                     fr_rate=150)
        else:
            raise ValueError('Input size is not appropriate for save')

    # layer list
    layer_dict = features
    layer_list = list(features.keys())
    num_of_layer = len(layer_list)

    # layer weight
    if layer_weight is None:
        weights = np.ones(num_of_layer, dtype=np.float32)
        weights = weights / weights.sum()
        layer_weight = {layer: weights[j] for j, layer in enumerate(layer_list)}

    # feature mask
    if feature_masks is None:
        feature_masks = create_feature_masks(layer_dict, masks=mask,
                                             channels=channel)

    # iteration for gradient descent
    input = initial_input.copy().astype(np.float32)
    if len(input_size) == 3:
        input = img_preprocess(input, img_mean, img_std, norm)
    else:
        input = vid_preprocess(input, img_mean, img_std, norm)

    loss_list = np.zeros(iter_n, dtype='float32')

    for t in range(iter_n):
        # parameters linearly interpolated over the iterations
        lr = lr_start + t * (lr_end - lr_start) / iter_n
        momentum = momentum_start + t * (momentum_end - momentum_start) / iter_n
        decay = decay_start + t * (decay_end - decay_start) / iter_n
        sigma = sigma_start + t * (sigma_end - sigma_start) / iter_n

        # shift
        if image_jitter:
            ox, oy = np.random.randint(-jitter_size, jitter_size + 1, 2)
            input = np.roll(np.roll(input, ox, -1), oy, -2)

        # forward
        input = torch.tensor(input[np.newaxis], requires_grad=True)
        if opt_name == 'SGD':
            op = optim.SGD([input], lr=lr, momentum=momentum)
        else:
            # the remaining supported optimizers take only a learning rate
            optimizers = {
                'Adam': optim.Adam, 'Adadelta': optim.Adadelta,
                'Adagrad': optim.Adagrad, 'AdamW': optim.AdamW,
                'SparseAdam': optim.SparseAdam, 'Adamax': optim.Adamax,
                'ASGD': optim.ASGD, 'RMSprop': optim.RMSprop,
                'Rprop': optim.Rprop,
            }
            op = optimizers[opt_name]([input], lr=lr)

        fw = get_cnn_features(net, input, features.keys(), prehook_dict)

        # backward for net
        err = 0.
        loss = 0.
        # set the grad of network to 0
        net.zero_grad()
        op.zero_grad()
        for j in range(num_of_layer):
            target_layer_id = num_of_layer - 1 - j
            target_layer = layer_list[target_layer_id]

            # extract the activation and target feature, then apply the mask
            act_j = fw[target_layer_id].clone()
            feat_j = features[target_layer].clone()
            mask_j = feature_masks[target_layer]
            layer_weight_j = layer_weight[target_layer]

            masked_act_j = torch.masked_select(act_j, torch.FloatTensor(mask_j).bool())
            masked_feat_j = torch.masked_select(feat_j, torch.FloatTensor(mask_j).bool())

            # calculate loss using pytorch loss function
            loss_j = loss_fun(masked_act_j, masked_feat_j) * layer_weight_j

            # backward the gradient to the video
            loss_j.backward(retain_graph=True)
            loss += loss_j.detach().numpy()

        if grad_normalize:
            grad_mean = torch.abs(input.grad).mean()
            if grad_mean > 0:
                input.grad /= grad_mean
        op.step()

        input = input.detach().numpy()[0]
        err = err + loss
        loss_list[t] = loss

        # clip pixels with extreme value
        if clip_extreme and (t + 1) % clip_extreme_every == 0:
            e_pct = e_pct_start + t * (e_pct_end - e_pct_start) / iter_n
            input = clip_extreme_value(input, e_pct)

        # clip pixels with small norm
        if clip_small_norm and (t + 1) % clip_small_norm_every == 0:
            n_pct = n_pct_start + t * (n_pct_end - n_pct_start) / iter_n
            input = clip_small_norm_value(input, n_pct)

        # unshift
        if image_jitter:
            input = np.roll(np.roll(input, -ox, -1), -oy, -2)

        # L_2 decay
        input = (1 - decay) * input

        # gaussian blur
        if image_blur:
            if len(input_size) == 3:
                input = gaussian_blur(input, sigma)
            else:
                for i in range(input.shape[1]):
                    input[:, i] = gaussian_blur(input[:, i], sigma)

        # disp info
        if (t + 1) % disp_every == 0:
            print('iter=%d; err=%g;' % (t + 1, err))

        # save image
        if save_intermediate and ((t + 1) % save_intermediate_every == 0):
            if len(input_size) == 3:
                save_name = '%05d.jpg' % (t + 1)
                PIL.Image.fromarray(normalise_img(
                    img_deprocess(input, img_mean, img_std, norm))).save(
                        os.path.join(save_intermediate_path, save_name))
            else:
                save_stim = input
                # with cv2 and ffmpeg installed, save_video can write the
                # preferred video in a video format
                save_name = '%05d.avi' % (t + 1)
                save_video(normalise_vid(vid_deprocess(save_stim, img_mean,
                                                       img_std, norm)),
                           save_name, save_intermediate_path, bgr, fr_rate=30)
                save_name = '%05d.gif' % (t + 1)
                save_gif(normalise_vid(vid_deprocess(save_stim, img_mean,
                                                     img_std, norm)),
                         save_name, save_intermediate_path, bgr, fr_rate=150)

    # return img
    if len(input_size) == 3:
        return img_deprocess(input, img_mean, img_std, norm), loss_list
    else:
        return vid_deprocess(input, img_mean, img_std, norm), loss_list
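# Usage sketch for reconstruct_stim (hypothetical names: `encoder` is any
# torch.nn.Module and `target_feats` is a dict mapping layer names to feature
# tensors extracted from the stimulus to be reconstructed):
#
#   recon, loss_hist = reconstruct_stim(
#       target_feats, encoder,
#       input_size=(224, 224, 3),   # image; a 4-D shape is treated as video
#       opt_name='SGD', iter_n=200,
#       save_intermediate=True)
#
# The function returns the deprocessed image/video and the per-iteration loss.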
parser.add_argument('--path', default="inference", type=str, metavar='DIR',
                    help='path to get images')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
args = parser.parse_args()
file_names = sorted(os.listdir(args.path))

mymodel = ResnetGenerator()
mymodel.to(device)
os.makedirs("result", exist_ok=True)
mymodel.load_state_dict(
    torch.load(os.path.join("model_weight", 'best_weight.pt'),
               map_location=device)['G_state_dict'])
mymodel.eval()

for file_name in file_names:
    video = read_video(os.path.join(args.path, file_name),
                       inference=True).to(device)
    with torch.no_grad():
        reconstructed = []
        for j in range(video.size(0)):
            reconstructed.append(mymodel(video[j][None]).cpu().numpy())
        reconstructed = np.concatenate(reconstructed)
    # the original passed '_filled.avi' as a separate path component, which
    # would write to result/<name>/_filled.avi; append the suffix instead
    save_video(reconstructed,
               os.path.join("result", file_name + '_filled.avi'))
def generate_preferred_tmp(net, exec_code, channel=None, feature_mask=None,
                           img_mean=(0, 0, 0), img_std=(1, 1, 1), norm=255,
                           input_size=(224, 224, 3), bgr=False,
                           feature_weight=1., initial_input=None, iter_n=200,
                           lr_start=1., lr_end=1.,
                           momentum_start=0.001, momentum_end=0.001,
                           decay_start=0.001, decay_end=0.001,
                           grad_normalize=True,
                           image_jitter=True, jitter_size=32, jitter_size_z=2,
                           image_blur=True,
                           sigma_xy_start=2.5, sigma_xy_end=0.5,
                           sigma_t_start=0.01, sigma_t_end=0.002,
                           use_p_norm_reg=False, p=2,
                           lamda_start=0.5, lamda_end=0.5,
                           use_TV_norm_reg=False, TVbeta1=2, TVbeta2=2,
                           TVlamda_start_sp=0.5, TVlamda_end_sp=0.5,
                           TVlamda_start_tmp=0.5, TVlamda_end_tmp=0.5,
                           clip_extreme=False, clip_extreme_every=4,
                           e_pct_start=1, e_pct_end=1,
                           clip_small_norm=False, clip_small_norm_every=4,
                           n_pct_start=5., n_pct_end=5.,
                           clip_small_contribution=False,
                           clip_small_contribution_every=4,
                           c_pct_start=5., c_pct_end=5.,
                           disp_every=1,
                           save_intermediate=False, save_intermediate_every=1,
                           save_intermediate_path=None):
    '''Generate the preferred image/video for the target units using gradient
    descent with momentum.

    Parameters
    ----------
    net: torch.nn.Module
        CNN model corresponding to the target CNN features.
    feature_mask: ndarray
        The mask used to select the target units. The shape of the mask should
        be the same as that of the CNN features in that layer. The values of
        the mask array are binary (1: target unit; 0: irrelevant unit).
    exec_code: list
        The code to extract the intermediate layer. This code is run in the
        'get_cnn_features' function.
    img_mean: np.ndarray
        The mean, in RGB order, used to pre/de-process the input/output
        image/video.
    img_std: np.ndarray
        The std, in RGB order, used to pre/de-process the input/output
        image/video.
    input_size: np.ndarray
        The shape corresponding to the CNN's expected input.

    Optional Parameters
    ----------
    feature_weight: float or ndarray
        The weight for each target unit. If it is a scalar, the scalar is used
        as the universal weight for all units. If it is a numpy array, it
        specifies a different weight for each unit.
    initial_input: ndarray
        Initial image for the optimization. Set to None to use random noise as
        the initial image.
    iter_n: int
        The total number of iterations.
    lr_start: float
        The learning rate at the start of the optimization. The learning rate
        linearly decreases from lr_start to lr_end during the optimization.
    lr_end: float
        The learning rate at the end of the optimization.
    momentum_start: float
        The momentum (gradient descent with momentum) at the start of the
        optimization. The momentum linearly decreases from momentum_start to
        momentum_end during the optimization.
    momentum_end: float
        The momentum at the end of the optimization.
    decay_start: float
        The decay rate of the image pixels at the start of the optimization.
        The decay rate linearly decreases from decay_start to decay_end
        during the optimization.
    decay_end: float
        The decay rate at the end of the optimization.
    grad_normalize: bool
        Whether to normalize the gradient at each iteration.
    image_jitter: bool
        Whether to use image jittering. If True, the intermediate
        reconstructed image is randomly shifted at each iteration.
    jitter_size: int
        Spatial image jittering, in number of pixels.
    jitter_size_z: int
        Temporal jittering, in number of frames.
    image_blur: bool
        Whether to use image smoothing. If True, the image is smoothed at
        each iteration.
    sigma_xy_start: float
        The size of the spatial gaussian filter for image smoothing at the
        start of the optimization. The sigma linearly decreases from
        sigma_xy_start to sigma_xy_end during the optimization.
    sigma_xy_end: float
        The size of the spatial gaussian filter at the end of the
        optimization.
    sigma_t_start: float
        The size of the temporal gaussian filter at the start of the
        optimization. The sigma linearly decreases from sigma_t_start to
        sigma_t_end during the optimization.
    sigma_t_end: float
        The size of the temporal gaussian filter at the end of the
        optimization.
    use_p_norm_reg: bool
        Whether to use a p-norm loss on the image as a regularization term.
    p: float
        The order of the p-norm loss of the image.
    lamda_start: float
        The weight for the p-norm loss at the start of the optimization. The
        lamda linearly decreases from lamda_start to lamda_end during the
        optimization.
    lamda_end: float
        The weight for the p-norm loss at the end of the optimization.
    use_TV_norm_reg: bool
        Whether to use a TV-norm as a regularization term.
    TVbeta1, TVbeta2: float
        The orders of the spatial and temporal TV-norms.
    TVlamda_start_sp, TVlamda_start_tmp: float
        The weights for the spatial and temporal TV-norm regularization terms
        at the start of the optimization. Each weight linearly decreases from
        its start value to its end value during the optimization.
    TVlamda_end_sp, TVlamda_end_tmp: float
        The weights for the spatial and temporal TV-norm regularization terms
        at the end of the optimization.
    clip_extreme: bool
        Whether to clip pixels with extreme high or low values.
    clip_extreme_every: int
        Clip the pixels with extreme values every n iterations.
    e_pct_start: float
        The percentage of pixels to be clipped at the start of the
        optimization. The percentage linearly decreases from e_pct_start to
        e_pct_end during the optimization.
    e_pct_end: float
        The percentage of pixels to be clipped at the end of the
        optimization.
    clip_small_norm: bool
        Whether to clip pixels with a small norm of the RGB values.
    clip_small_norm_every: int
        Clip the pixels with a small norm every n iterations.
    n_pct_start: float
        The percentage of pixels to be clipped at the start of the
        optimization. The percentage linearly decreases from n_pct_start to
        n_pct_end during the optimization.
    n_pct_end: float
        The percentage of pixels to be clipped at the end of the
        optimization.
    clip_small_contribution: bool
        Whether to clip pixels with a small contribution: the norm of the RGB
        channels of (img*grad).
    clip_small_contribution_every: int
        Clip the pixels with a small contribution every n iterations.
    c_pct_start: float
        The percentage of pixels to be clipped at the start of the
        optimization. The percentage linearly decreases from c_pct_start to
        c_pct_end during the optimization.
    c_pct_end: float
        The percentage of pixels to be clipped at the end of the
        optimization.
    disp_every: int
        Display the optimization information every n iterations.
    save_intermediate: bool
        Whether to save the intermediate reconstruction.
    save_intermediate_every: int
        Save the intermediate reconstruction every n iterations.
    save_intermediate_path: str
        The path to save the intermediate reconstruction.

    Returns
    -------
    img: ndarray
        The preferred image/video, same shape as input_size.
    '''
    # make save dir
    if save_intermediate:
        if save_intermediate_path is None:
            save_intermediate_path = os.path.join(
                '.', 'preferred_gd_' + datetime.now().strftime('%Y%m%dT%H%M%S'))
        os.makedirs(save_intermediate_path, exist_ok=True)

    # initial input
    if initial_input is None:
        initial_input = np.random.randint(0, 256, (input_size))
    else:
        input_size = initial_input.shape

    # reference image norm, used by the regularization terms
    noise_vid = np.random.randint(0, 256, (input_size))
    img_norm0 = np.linalg.norm(noise_vid) / 2.

    if save_intermediate:
        if len(input_size) == 3:  # image
            save_name = 'initial_image.jpg'
            if bgr:
                PIL.Image.fromarray(np.uint8(initial_input[..., [2, 1, 0]])).save(
                    os.path.join(save_intermediate_path, save_name))
            else:
                PIL.Image.fromarray(np.uint8(initial_input)).save(
                    os.path.join(save_intermediate_path, save_name))
        elif len(input_size) == 4:  # video
            save_name = 'initial_video.avi'
            save_video(initial_input, save_name, save_intermediate_path, bgr)
            save_name = 'initial_video.gif'
            save_gif(initial_input, save_name, save_intermediate_path, bgr,
                     fr_rate=150)
        else:
            raise ValueError('Input size is not appropriate for save')

    # create feature mask if not defined
    if feature_mask is None:
        feature_mask = create_feature_mask(net, exec_code, input_size, channel)

    # iteration for gradient descent
    init_input = initial_input.copy()
    if len(input_size) == 3:  # image
        input = img_preprocess(init_input, img_mean, img_std, norm)
    else:  # video
        input = vid_preprocess(init_input, img_mean, img_std, norm)

    delta_input = np.zeros_like(input)
    feat_grad = np.zeros_like(feature_mask)
    # gradient descent is used, so the gradient is negative in order to make
    # the target units have a high positive activation
    feat_grad[feature_mask == 1] = -1.
    feat_grad = feat_grad * feature_weight

    # loss function (minus loss)
    loss_fun = minusLoss()

    for t in range(iter_n):
        # parameters linearly interpolated over the iterations
        lr = lr_start + t * (lr_end - lr_start) / iter_n
        momentum = momentum_start + t * (momentum_end - momentum_start) / iter_n
        decay = decay_start + t * (decay_end - decay_start) / iter_n
        sigma_xy = sigma_xy_start + t * (sigma_xy_end - sigma_xy_start) / iter_n
        sigma_t = sigma_t_start + t * (sigma_t_end - sigma_t_start) / iter_n

        # shift
        if image_jitter:
            ox, oy = np.random.randint(-jitter_size, jitter_size + 1, 2)
            oz = np.random.randint(-jitter_size_z, jitter_size_z + 1)
            input = np.roll(np.roll(np.roll(input, ox, -1), oy, -2), oz, -3)
            delta_input = np.roll(
                np.roll(np.roll(delta_input, ox, -1), oy, -2), oz, -3)

        # create Tensor
        input = torch.Tensor(input[np.newaxis])
        input.requires_grad_()

        # forward (ByteTensor masks are deprecated; use a boolean mask)
        fw = get_cnn_features(net, input, exec_code)[0]
        feat = torch.masked_select(fw, torch.tensor(feature_mask).bool())
        feat_abs_mean = np.mean(np.abs(feat.detach().numpy()))

        # for the first iteration, input.grad is None
        if input.grad is not None:
            input.grad.data.zero_()  # zero grad
        net.zero_grad()

        # backward for net
        loss = loss_fun(feat)
        loss.backward()
        grad = input.grad.numpy()
        input = input.detach().numpy()

        # normalize gradient
        if grad_normalize:
            grad_mean = np.abs(grad).mean()
            if grad_mean > 0:
                grad = grad / grad_mean

        # gradient with momentum
        delta_input = delta_input * momentum + grad

        # p norm regularization
        if use_p_norm_reg:
            lamda = lamda_start + t * (lamda_end - lamda_start) / iter_n
            _, grad_r = p_norm(input, p)
            grad_r = grad_r / (img_norm0**2)
            if grad_normalize:
                grad_mean = np.abs(grad_r).mean()
                if grad_mean > 0:
                    grad_r = grad_r / grad_mean
            delta_input = delta_input + lamda * grad_r

        # TV norm regularization
        if use_TV_norm_reg:
            TVlamda_sp = TVlamda_start_sp + t * (TVlamda_end_sp -
                                                 TVlamda_start_sp) / iter_n
            if len(input_size) == 3:
                loss_r, grad_r = TV_norm(input, TVbeta1)
                loss_r = loss_r / (img_norm0**2)
                grad_r = grad_r / (img_norm0**2)
                if grad_normalize:
                    grad_mean = np.abs(grad_r).mean()
                    if grad_mean > 0:
                        grad_r = grad_r / grad_mean
                delta_input = delta_input + TVlamda_sp * grad_r
            else:
                # spatial
                loss_r_sp, grad_r_sp = TV_norm_sp(input, TVbeta1)
                loss_r_sp = loss_r_sp / (img_norm0**2)
                grad_r_sp = grad_r_sp / (img_norm0**2)
                if grad_normalize:
                    grad_mean_sp = np.abs(grad_r_sp).mean()
                    if grad_mean_sp > 0:
                        grad_r_sp = grad_r_sp / grad_mean_sp
                # temporal
                TVlamda_tmp = TVlamda_start_tmp + t * (TVlamda_end_tmp -
                                                       TVlamda_start_tmp) / iter_n
                loss_r_tmp, grad_r_tmp = TV_norm_tmp(input, TVbeta2)
                loss_r_tmp = loss_r_tmp / (img_norm0**2)
                grad_r_tmp = grad_r_tmp / (img_norm0**2)
                if grad_normalize:
                    grad_mean_tmp = np.abs(grad_r_tmp).mean()
                    if grad_mean_tmp > 0:
                        grad_r_tmp = grad_r_tmp / grad_mean_tmp
                delta_input = (delta_input + TVlamda_sp * grad_r_sp
                               + TVlamda_tmp * grad_r_tmp)

        # input update; [0] removes the newaxis
        input = np.add(input, -lr * delta_input, dtype=np.float32)[0]
        grad = grad[0]
        delta_input = delta_input[0]

        # clip pixels with extreme value
        if clip_extreme and (t + 1) % clip_extreme_every == 0:
            e_pct = e_pct_start + t * (e_pct_end - e_pct_start) / iter_n
            input = clip_extreme_pixel(input, e_pct)

        # clip pixels with small norm
        if clip_small_norm and (t + 1) % clip_small_norm_every == 0:
            n_pct = n_pct_start + t * (n_pct_end - n_pct_start) / iter_n
            input = clip_small_norm_pixel(input, n_pct)

        # clip pixels with small contribution
        if clip_small_contribution and (t + 1) % clip_small_contribution_every == 0:
            c_pct = c_pct_start + t * (c_pct_end - c_pct_start) / iter_n
            input = clip_small_contribution_pixel(input, grad, c_pct)

        # unshift
        if image_jitter:
            input = np.roll(np.roll(np.roll(input, -ox, -1), -oy, -2), -oz, -3)
            delta_input = delta_input - grad
            delta_input = np.roll(
                np.roll(np.roll(delta_input, -ox, -1), -oy, -2), -oz, -3)
            delta_input = delta_input + grad

        # L_2 decay
        input = (1 - decay) * input

        # gaussian blur
        if image_blur:
            if len(input_size) == 3:
                # the original referenced an undefined `sigma`; use sigma_xy
                input = gaussian_blur(input, sigma_xy)
            else:
                input = gaussian_blur_vid(input, sigma_xy, sigma_t)

        # disp info
        if (t + 1) % disp_every == 0:
            print('iter=%d; mean(abs(feat))=%g;' % (t + 1, feat_abs_mean))

        # save image
        if save_intermediate and ((t + 1) % save_intermediate_every == 0):
            if len(input_size) == 3:
                save_name = '%05d.jpg' % (t + 1)
                if bgr:
                    PIL.Image.fromarray(normalise_img(
                        img_deprocess(input, img_mean, img_std,
                                      norm)[..., [2, 1, 0]])).save(
                            os.path.join(save_intermediate_path, save_name))
                else:
                    PIL.Image.fromarray(normalise_img(
                        img_deprocess(input, img_mean, img_std, norm))).save(
                            os.path.join(save_intermediate_path, save_name))
            else:
                save_name = '%05d.avi' % (t + 1)
                save_video(normalise_vid(vid_deprocess(input, img_mean,
                                                       img_std, norm)),
                           save_name, save_intermediate_path, bgr, fr_rate=10)
                save_name = '%05d.gif' % (t + 1)
                save_gif(normalise_vid(vid_deprocess(input, img_mean,
                                                     img_std, norm)),
                         save_name, save_intermediate_path, bgr, fr_rate=150)

    # return input
    if len(input_size) == 3:
        return img_deprocess(input, img_mean, img_std, norm)
    else:
        return vid_deprocess(input, img_mean, img_std, norm)
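# Usage sketch for generate_preferred_tmp (hypothetical example; `net` is the
# CNN and `exec_code` the layer-extraction code expected by get_cnn_features;
# a feature_mask of None lets create_feature_mask build one for `channel`):
#
#   preferred = generate_preferred_tmp(
#       net, exec_code, channel=0,
#       input_size=(16, 224, 224, 3),  # a 16-frame video
#       iter_n=200, save_intermediate=True)
#
# With a 4-D input_size the intermediate results are written as .avi/.gif via
# save_video/save_gif; with a 3-D input_size they are saved as .jpg images.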
        cv2.imwrite("translated_samples/" + os.path.basename(input_item),
                    concat)
    elif ext in [".mp4", ".avi"]:
        cam = cv2.VideoCapture(input_item)
        fps = cam.get(cv2.CAP_PROP_FPS)
        frames = []
        for original_image, translated_image in tqdm.tqdm(
                demo.run_on_video(cam)):
            if args.show_original:
                concat = np.concatenate([original_image, translated_image],
                                        axis=1)
            else:
                concat = translated_image
            height, width = concat.shape[:2]
            resized = cv2.resize(concat, None, fx=0.5, fy=0.5,
                                 interpolation=cv2.INTER_AREA)
            frames.append(resized)
        cam.release()
        save_video(frames,
                   "translated_samples/" + os.path.basename(input_item), fps)