Code Example #1
def save_videos(tensor: Union[torch.Tensor, List[torch.Tensor]],
                fp: Union[Text, pathlib.Path, BinaryIO],
                format: Optional[str] = None,
                **kwargs) -> None:
    #print(2)
    os.makedirs('sample_frame_v2', exist_ok=True)
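    # NOTE: the body below assumes `tensor` is a single 5-D Tensor of shape
    # (batch, channel, frames, height, width) and `fp` is a plain string path.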
    s_size, channel, fr, h, w = tensor.shape
    f_name_base, fmt = fp.split('.')
    for f in range(fr):
        tensor_tmp = tensor[:, :, f]
        f_name = '-'.join([f_name_base, f'fr_{f}', f'.{fmt}'])
        #print(f_name)
        save_image(tensor_tmp, f_name, **kwargs)

    merge_list = []
    for f in range(fr):
        f_name = '-'.join([f_name_base, f'fr_{f}', f'.{fmt}'])
        ee = Image.open(f_name)
        merge_list.append(np.array(ee))

    merge_list = np.array(merge_list)

    f_name_base = f_name_base.replace('sample_frame_v2', 'sample_video_v2')
    os.makedirs('sample_video_v2', exist_ok=True)
    save_name = f_name_base + '.avi'

    save_video(merge_list, save_name, '.', bgr=False, fr_rate=16)
    save_name = f_name_base + '.gif'
    save_gif(merge_list, save_name, '.', bgr=False, fr_rate=60)

    # remove the intermediate frame files
    shutil.rmtree('sample_frame_v2')
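
The examples on this page call project-specific save_video helpers with differing signatures. As a reference point only, a minimal sketch of such a helper built on OpenCV's VideoWriter might look like the following (the name save_video_sketch, the XVID codec, and the default frame rate are assumptions, not any project's actual code):

import os

import cv2
import numpy as np


def save_video_sketch(frames, save_name, save_path='.', bgr=False, fr_rate=30):
    # frames: iterable of (H, W, 3) uint8 arrays
    h, w = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    writer = cv2.VideoWriter(os.path.join(save_path, save_name), fourcc, fr_rate, (w, h))
    for frame in frames:
        if not bgr:
            # VideoWriter expects BGR channel order; convert if frames are RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        writer.write(np.asarray(frame, dtype=np.uint8))
    writer.release()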
Code Example #2
File: test.py  Project: Cris-zj/DaSiamRPN
def main(imagedir, gtdir):
    # load net
    net_file = join(realpath(dirname(__file__)), 'SiamRPNBIG.model')
    net = SiamRPNBIG()
    net.load_state_dict(torch.load(net_file))
    net.eval().cuda()

    # warm up
    for i in range(10):
        net.temple(
            torch.autograd.Variable(torch.FloatTensor(1, 3, 127, 127)).cuda())
        net(torch.autograd.Variable(torch.FloatTensor(1, 3, 255, 255)).cuda())

    # start to track
    # get the first frame groundtruth
    gt_file = os.path.join(gtdir, 'gt.txt')
    with open(gt_file, 'r') as f:
        lines = f.readlines()
    gt = []
    for line in lines:
        line = line.split(' ')
        gt.append([int(float(x)) for x in line])
    init_bbox = gt[0]  # top-left x y,w,h
    target_pos, target_sz = rect_2_cxy_wh(
        init_bbox)  # top-left x y,w,h --> center x y,w,h

    image_list = glob.glob(os.path.join(imagedir, '*.jpg'))
    image_list.sort()
    im = cv2.imread(image_list[0])  # HxWxC

    state = SiamRPN_init(im, target_pos, target_sz, net)  # init tracker
    bboxes = []
    for i in range(1, len(gt)):
        im = cv2.imread(image_list[i])  # HxWxC
        state = SiamRPN_track(state, im)  # track
        res = cxy_wh_2_rect(
            state['target_pos'],
            state['target_sz'])  # center x y,w,h --> top-left x y,w,h
        bboxes.append(res.tolist())

    _, precision, precision_auc, iou = _compile_results(gt[1:], bboxes)
    print(' -- Precision ' + "(20 px)"  + ': ' + "%.2f" % precision +\
            ' -- Precision AUC: ' + "%.2f" % precision_auc + \
            ' -- IOU: ' + "%.2f" % iou + ' --')

    isSavebbox = True
    if isSavebbox:
        print('saving bbox...')
        res_bbox_file = os.path.join('results_bbox.json')
        json.dump(bboxes, open(res_bbox_file, 'w'), indent=2)

    isSavevideo = True
    if isSavevideo:
        print('saving video...')
        save_video(image_list, bboxes)
    print('done')
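
The rect_2_cxy_wh / cxy_wh_2_rect helpers above are only described by their comments ("top-left x y,w,h <--> center x y,w,h"). A sketch of what such conversions typically look like is shown below; this is an assumption based on those comments, not DaSiamRPN's exact implementation:

import numpy as np


def rect_2_cxy_wh(rect):
    # [x_topleft, y_topleft, w, h] -> (center position, size)
    x, y, w, h = rect
    return np.array([x + w / 2, y + h / 2]), np.array([w, h])


def cxy_wh_2_rect(pos, sz):
    # (center position, size) -> [x_topleft, y_topleft, w, h]
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])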
Code Example #3
File: main_silent.py  Project: YapengTian/CCOL-CVPR21
def output_visuals(vis_rows, batch_data, outputs, args):
    # fetch data and predictions
    mag_mix = batch_data['mag_mix']
    phase_mix = batch_data['phase_mix']
    frames = batch_data['frames']
    infos = batch_data['infos']

    pred_masks_ = outputs['pred_masks']
    gt_masks_ = outputs['gt_masks']
    mag_mix_ = outputs['mag_mix']
    weight_ = outputs['weight']

    # unwarp log scale
    N = args.num_mix  #-1
    B = mag_mix.size(0)
    pred_masks_linear = [None for n in range(N)]
    gt_masks_linear = [None for n in range(N)]
    for n in range(N):
        if args.log_freq:
            grid_unwarp = torch.from_numpy(
                warpgrid(B,
                         args.stft_frame // 2 + 1,
                         gt_masks_[0].size(3),
                         warp=False)).to(args.device)
            pred_masks_linear[n] = F.grid_sample(pred_masks_[n], grid_unwarp)
            gt_masks_linear[n] = F.grid_sample(gt_masks_[n], grid_unwarp)
        else:
            pred_masks_linear[n] = pred_masks_[n]
            gt_masks_linear[n] = gt_masks_[n]

    # convert into numpy
    mag_mix = mag_mix.numpy()
    mag_mix_ = mag_mix_.detach().cpu().numpy()
    phase_mix = phase_mix.numpy()
    weight_ = weight_.detach().cpu().numpy()
    for n in range(N):
        pred_masks_[n] = pred_masks_[n].detach().cpu().numpy()
        pred_masks_linear[n] = pred_masks_linear[n].detach().cpu().numpy()
        gt_masks_[n] = gt_masks_[n].detach().cpu().numpy()
        gt_masks_linear[n] = gt_masks_linear[n].detach().cpu().numpy()

        # threshold if binary mask
        if args.binary_mask:
            pred_masks_[n] = (pred_masks_[n] > args.mask_thres).astype(
                np.float32)
            pred_masks_linear[n] = (pred_masks_linear[n] >
                                    args.mask_thres).astype(np.float32)

    # loop over each sample
    for j in range(B):
        row_elements = []

        # video names
        prefix = []
        for n in range(N):
            prefix.append('-'.join(
                infos[n][0][j].split('/')[-2:]).split('.')[0])
        prefix = '+'.join(prefix)
        makedirs(os.path.join(args.vis, prefix))

        # save mixture
        mix_wav = istft_reconstruction(mag_mix[j, 0],
                                       phase_mix[j, 0],
                                       hop_length=args.stft_hop)
        mix_amp = magnitude2heatmap(mag_mix_[j, 0])
        weight = magnitude2heatmap(weight_[j, 0], log=False, scale=100.)
        filename_mixwav = os.path.join(prefix, 'mix.wav')
        filename_mixmag = os.path.join(prefix, 'mix.jpg')
        filename_weight = os.path.join(prefix, 'weight.jpg')
        imsave(os.path.join(args.vis, filename_mixmag), mix_amp[::-1, :, :])
        imsave(os.path.join(args.vis, filename_weight), weight[::-1, :])
        wavfile.write(os.path.join(args.vis, filename_mixwav), args.audRate,
                      mix_wav)
        row_elements += [{
            'text': prefix
        }, {
            'image': filename_mixmag,
            'audio': filename_mixwav
        }]

        # save each component
        preds_wav = [None for n in range(N)]
        for n in range(N):

            # GT and predicted audio recovery
            gt_mag = mag_mix[j, 0] * gt_masks_linear[n][j, 0]
            gt_wav = istft_reconstruction(gt_mag,
                                          phase_mix[j, 0],
                                          hop_length=args.stft_hop)
            pred_mag = mag_mix[j, 0] * pred_masks_linear[n][j, 0]
            preds_wav[n] = istft_reconstruction(pred_mag,
                                                phase_mix[j, 0],
                                                hop_length=args.stft_hop)

            # output masks
            filename_gtmask = os.path.join(prefix,
                                           'gtmask{}.jpg'.format(n + 1))
            filename_predmask = os.path.join(prefix,
                                             'predmask{}.jpg'.format(n + 1))
            gt_mask = (np.clip(gt_masks_[n][j, 0], 0, 1) * 255).astype(
                np.uint8)
            pred_mask = (np.clip(pred_masks_[n][j, 0], 0, 1) * 255).astype(
                np.uint8)
            imsave(os.path.join(args.vis, filename_gtmask), gt_mask[::-1, :])
            imsave(os.path.join(args.vis, filename_predmask),
                   pred_mask[::-1, :])

            # output spectrogram (log of magnitude, shown as colormap)
            filename_gtmag = os.path.join(prefix, 'gtamp{}.jpg'.format(n + 1))
            filename_predmag = os.path.join(prefix,
                                            'predamp{}.jpg'.format(n + 1))
            gt_mag = magnitude2heatmap(gt_mag)
            pred_mag = magnitude2heatmap(pred_mag)
            imsave(os.path.join(args.vis, filename_gtmag), gt_mag[::-1, :, :])
            imsave(os.path.join(args.vis, filename_predmag),
                   pred_mag[::-1, :, :])

            # output audio
            filename_gtwav = os.path.join(prefix, 'gt{}.wav'.format(n + 1))
            filename_predwav = os.path.join(prefix, 'pred{}.wav'.format(n + 1))
            wavfile.write(os.path.join(args.vis, filename_gtwav), args.audRate,
                          gt_wav)
            wavfile.write(os.path.join(args.vis, filename_predwav),
                          args.audRate, preds_wav[n])

            # output video
            frames_tensor = [
                recover_rgb(frames[n][j, :, t]) for t in range(args.num_frames)
            ]
            frames_tensor = np.asarray(frames_tensor)
            path_video = os.path.join(args.vis, prefix,
                                      'video{}.mp4'.format(n + 1))
            save_video(path_video,
                       frames_tensor,
                       fps=args.frameRate / args.stride_frames)

            # combine gt video and audio
            filename_av = os.path.join(prefix, 'av{}.mp4'.format(n + 1))
            combine_video_audio(path_video,
                                os.path.join(args.vis, filename_gtwav),
                                os.path.join(args.vis, filename_av))

            row_elements += [{
                'video': filename_av
            }, {
                'image': filename_predmag,
                'audio': filename_predwav
            }, {
                'image': filename_gtmag,
                'audio': filename_gtwav
            }, {
                'image': filename_predmask
            }, {
                'image': filename_gtmask
            }]

        row_elements += [{'image': filename_weight}]
        vis_rows.append(row_elements)
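
istft_reconstruction above recombines a (possibly masked) magnitude spectrogram with the mixture phase and inverts it back to a waveform. A plausible sketch, assuming librosa is available (the function name, default hop length, and clipping range are assumptions):

import numpy as np
import librosa


def istft_reconstruction_sketch(mag, phase, hop_length=256):
    # Rebuild the complex spectrogram from magnitude and phase, then invert with ISTFT.
    spec = mag.astype(np.complex64) * np.exp(1j * phase)
    wav = librosa.istft(spec, hop_length=hop_length)
    return np.clip(wav, -1., 1.)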
Code Example #4
                    default=-1,
                    help="frame height of output video")
args = parser.parse_args()

with open(args.cascade, "r") as f:
    xml = f.read()
stages, features, width, height = utils.parse_cascade(xml)

image = cv2.imread(args.image, 0)
image_height, image_width = image.shape[:2]

new_image_height = int(image_height * args.scale)
new_image_width = int(image_width * args.scale)

image_scaled = cv2.resize(image, (new_image_width, new_image_height),
                          interpolation=cv2.INTER_NEAREST)

t0 = time.time()
marked_images = utils.get_stage_images(image_scaled, stages, features, height,
                                       width, args.k)
t1 = time.time()
print(t1 - t0, "s")

if args.output_width == -1 or args.output_height == -1:
    args.output_width = new_image_width
    args.output_height = new_image_height

utils.save_video(args.output, marked_images, args.output_tps,
                 args.output_width, args.output_height)
print("saved output to", args.output)
Code Example #5
File: icnn.py  Project: ShuntaroAoki/pytorch_iCNN
def reconstruct_stim(features, net,
                     img_mean=np.array((0, 0, 0)).astype(np.float32),
                     img_std=np.array((1, 1, 1)).astype(np.float32),
                     norm=255,
                     bgr=False,
                     initial_input=None,
                     input_size=(224, 224, 3),
                     feature_masks=None,
                     layer_weight=None, channel=None, mask=None,
                     opt_name='SGD',
                     prehook_dict={},
                     lr_start=0.02, lr_end=1e-12,
                     momentum_start=0.009, momentum_end=0.009,
                     decay_start=0.02, decay_end=1e-11,
                     grad_normalize=True,
                     image_jitter=False, jitter_size=4,
                     image_blur=True, sigma_start=2, sigma_end=0.5,
                     p=3, lamda=0.5,
                     TVlambda=[0, 0],
                     clip_extreme=False, clip_extreme_every=4, e_pct_start=1, e_pct_end=1,
                     clip_small_norm=False, clip_small_norm_every=4, n_pct_start=5., n_pct_end=5.,
                     loss_type='l2', iter_n=200, save_intermediate=False,
                     save_intermediate_every=1, save_intermediate_path=None,
                     disp_every=1,
                     ):
    if loss_type == "l2":
        loss_fun = torch.nn.MSELoss(reduction='sum')
    elif loss_type == "L2_with_reg":
        loss_fun = MSE_with_regulariztion(L_lambda=lamda, alpha=p, TV_lambda=TVlambda)
    else:
        raise ValueError(loss_type + ' is not correct')
    # make save dir
    if save_intermediate:
        if save_intermediate_path is None:
            save_intermediate_path = os.path.join('..', 'recon_img_by_icnn' + datetime.now().strftime('%Y%m%dT%H%M%S'))
        if not os.path.exists(save_intermediate_path):
            os.makedirs(save_intermediate_path)

    # image size
    input_size = input_size

    # image mean
    img_mean = img_mean
    img_std = img_std
    norm = norm
    # image norm
    noise_img = np.random.randint(0, 256, (input_size))
    img_norm0 = np.linalg.norm(noise_img)
    img_norm0 = img_norm0/2.

    # initial input
    if initial_input is None:
        initial_input = np.random.randint(0, 256, (input_size))
    else:
        input_size = initial_input.shape

    if save_intermediate:
        if len(input_size) == 3:
            #image
            save_name = 'initial_image.jpg'
            if bgr:
                PIL.Image.fromarray(np.uint8(initial_input[...,[2,1,0]])).save(os.path.join(save_intermediate_path, save_name))
            else:
                PIL.Image.fromarray(np.uint8(initial_input)).save(os.path.join(save_intermediate_path, save_name))
        elif len(input_size) == 4:
            # video
            # if you install cv2 and ffmpeg, you can use save_video function which save preferred video as video format
            save_name = 'initial_video.avi'
            save_video(initial_input, save_name, save_intermediate_path, bgr)

            save_name = 'initial_video.gif'
            save_gif(initial_input, save_name, save_intermediate_path, bgr,
                     fr_rate=150)

        else:
            print('Input size is not appropriate for save')
            raise ValueError('len(input_size) must be 3 or 4')


    # layer_list
    layer_dict = features
    layer_list = list(features.keys())

    # number of layers
    num_of_layer = len(layer_list)

    # layer weight
    if layer_weight is None:
        weights = np.ones(num_of_layer)
        weights = np.float32(weights)
        weights = weights / weights.sum()
        layer_weight = {}
        for j, layer in enumerate(layer_list):
            layer_weight[layer] = weights[j]

    # feature mask
    if feature_masks is None:
        feature_masks = create_feature_masks(layer_dict, masks=mask, channels=channel)

    # iteration for gradient descent
    input = initial_input.copy().astype(np.float32)
    if len(input_size) == 3:
        input = img_preprocess(input, img_mean, img_std, norm)
    else:
        input = vid_preprocess(input, img_mean, img_std, norm)

    loss_list = np.zeros(iter_n, dtype='float32')

    for t in range(iter_n):
        # parameters
        lr = lr_start + t * (lr_end - lr_start) / iter_n
        momentum = momentum_start + t * (momentum_end - momentum_start) / iter_n
        decay = decay_start + t * (decay_end - decay_start) / iter_n
        sigma = sigma_start + t * (sigma_end - sigma_start) / iter_n

        # shift
        if image_jitter:
            ox, oy = np.random.randint(-jitter_size, jitter_size+1, 2)
            input = np.roll(np.roll(input, ox, -1), oy, -2)

        # forward
        input = torch.tensor(input[np.newaxis], requires_grad=True)
        if opt_name == 'Adam':
            #op = optim.Adam([input], lr = lr)
            op = optim.Adam([input], lr = lr)
        elif opt_name == 'SGD':
            op = optim.SGD([input], lr=lr, momentum=momentum)
            #op = optim.SGD([input], lr=lr)
        elif opt_name == 'Adadelta':
            op = optim.Adadelta([input],lr = lr)
        elif opt_name == 'Adagrad':
            op = optim.Adagrad([input], lr = lr)
        elif opt_name == 'AdamW':
            op = optim.AdamW([input], lr = lr)
        elif opt_name == 'SparseAdam':
            op = optim.SparseAdam([input], lr = lr)
        elif opt_name == 'Adamax':
            op = optim.Adamax([input], lr = lr)
        elif opt_name == 'ASGD':
            op = optim.ASGD([input], lr = lr)

        elif opt_name == 'RMSprop':
            op = optim.RMSprop([input], lr = lr)
        elif opt_name == 'Rprop':
            op = optim.Rprop([input], lr = lr)
        else:
            raise ValueError('unknown optimizer name: ' + opt_name)
        fw = get_cnn_features(net, input, features.keys(), prehook_dict)
        # backward for net
        err = 0.
        loss = 0.
        # set the grad of network to 0
        net.zero_grad()
        op.zero_grad()
        for j in range(num_of_layer):

            # op.zero_grad()
            
            target_layer_id = num_of_layer -1 -j
            target_layer = layer_list[target_layer_id]
            # extract activation or mask at input true video, and mask
            act_j = fw[target_layer_id].clone()
            feat_j = features[target_layer].clone()
            mask_j = feature_masks[target_layer]

            layer_weight_j = layer_weight[target_layer]

            masked_act_j = torch.masked_select(act_j, torch.FloatTensor(mask_j).bool())
            masked_feat_j = torch.masked_select(feat_j, torch.FloatTensor(mask_j).bool())
            # calculate loss using pytorch loss function
            loss_j = loss_fun(masked_act_j, masked_feat_j) * layer_weight_j

            # backward the gradient to the video
            loss_j.backward(retain_graph=True)

            loss += loss_j.detach().numpy()
        if grad_normalize:
            grad_mean = torch.abs(input.grad).mean()
            if grad_mean > 0:
                input.grad /= grad_mean
        op.step()

        input = input.detach().numpy()[0]

        err = err + loss
        loss_list[t] = loss

        # clip pixels with extreme value
        if clip_extreme and (t+1) % clip_extreme_every == 0:
            e_pct = e_pct_start + t * (e_pct_end - e_pct_start) / iter_n
            input = clip_extreme_value(input, e_pct)

        # clip pixels with small norm
        if clip_small_norm and (t+1) % clip_small_norm_every == 0:
            n_pct = n_pct_start + t * (n_pct_end - n_pct_start) / iter_n
            input = clip_small_norm_value(input, n_pct)

        # unshift
        if image_jitter:
            input = np.roll(np.roll(input, -ox, -1), -oy, -2)

        # L_2 decay
        input = (1-decay) * input

        # gaussian blur
        if image_blur:
            if len(input_size) == 3:
                input = gaussian_blur(input, sigma)
            else:
                for i in range(input.shape[1]):
                    input[:, i] = gaussian_blur(input[:, i], sigma)

        # disp info
        if (t+1) % disp_every == 0:
            print('iter=%d; err=%g;' % (t+1, err))


        # save image
        if save_intermediate and ((t+1) % save_intermediate_every == 0):
            if len(input_size) == 3:
                save_name = '%05d.jpg' % (t+1)
                PIL.Image.fromarray(normalise_img(img_deprocess(input, img_mean, img_std, norm))).save(
                    os.path.join(save_intermediate_path, save_name))
            else:
                save_stim = input
                # if you install cv2 and ffmpeg, you can use save_video function which save preferred video as video format
                save_name = '%05d.avi' % (t + 1)
                save_video(normalise_vid(vid_deprocess(save_stim, img_mean, img_std, norm)), save_name,
                           save_intermediate_path, bgr, fr_rate=30)
                save_name = '%05d.gif' % (t + 1)
                save_gif(normalise_vid(vid_deprocess(save_stim, img_mean, img_std, norm)), save_name,
                         save_intermediate_path,
                         bgr, fr_rate=150)
    # return img
    if len(input_size) == 3:
        return img_deprocess(input, img_mean, img_std, norm), loss_list
    else:
        return vid_deprocess(input, img_mean, img_std, norm), loss_list
Code Example #6
    parser.add_argument('--path',
                        default="inference",
                        type=str,
                        metavar='DIR',
                        help='path to get images')
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    args = parser.parse_args()
    file_names = sorted(os.listdir(args.path))

    mymodel = ResnetGenerator()
    mymodel.to(device)

    os.makedirs(os.path.join("result"), exist_ok=True)
    mymodel.load_state_dict(
        torch.load(os.path.join("model_weight", 'best_weight.pt'),
                   map_location=device)['G_state_dict'])

    mymodel.eval()

    for i in range(len(file_names)):
        video = read_video(os.path.join(args.path, file_names[i]),
                           inference=True).to(device)
        with torch.no_grad():
            reconstructed = []
            for j in range(video.size(0)):
                reconstructed.append(mymodel(video[j][None]).cpu().numpy())
        reconstructed = np.concatenate(reconstructed)
        save_video(reconstructed,
                   os.path.join("result", file_names[i] + '_filled.avi'))
Code Example #7
def generate_preferred_tmp(net,
                           exec_code,
                           channel=None,
                           feature_mask=None,
                           img_mean=(0, 0, 0),
                           img_std=(1, 1, 1),
                           norm=255,
                           input_size=(224, 224, 3),
                           bgr=False,
                           feature_weight=1.,
                           initial_input=None,
                           iter_n=200,
                           lr_start=1.,
                           lr_end=1.,
                           momentum_start=0.001,
                           momentum_end=0.001,
                           decay_start=0.001,
                           decay_end=0.001,
                           grad_normalize=True,
                           image_jitter=True,
                           jitter_size=32,
                           jitter_size_z=2,
                           image_blur=True,
                           sigma_xy_start=2.5,
                           sigma_xy_end=0.5,
                           sigma_t_start=0.01,
                           sigma_t_end=0.002,
                           use_p_norm_reg=False,
                           p=2,
                           lamda_start=0.5,
                           lamda_end=0.5,
                           use_TV_norm_reg=False,
                           TVbeta1=2,
                           TVbeta2=2,
                           TVlamda_start_sp=0.5,
                           TVlamda_end_sp=0.5,
                           TVlamda_start_tmp=0.5,
                           TVlamda_end_tmp=0.5,
                           clip_extreme=False,
                           clip_extreme_every=4,
                           e_pct_start=1,
                           e_pct_end=1,
                           clip_small_norm=False,
                           clip_small_norm_every=4,
                           n_pct_start=5.,
                           n_pct_end=5.,
                           clip_small_contribution=False,
                           clip_small_contribution_every=4,
                           c_pct_start=5.,
                           c_pct_end=5.,
                           disp_every=1,
                           save_intermediate=False,
                           save_intermediate_every=1,
                           save_intermediate_path=None):
    '''Generate preferred image/video for the target units using gradient descent with momentum.

        Parameters
        ----------
        net: torch.nn.Module
            CNN model corresponding to the target CNN features.

        feature_mask: ndarray
            The mask used to select the target units.
            The shape of the mask should be the same as that of the CNN features in that layer.
            The values of the mask array are binary (1: target unit; 0: irrelevant unit).

        exec_code: list
            The code used to extract the intermediate layer. This code is run in the 'get_cnn_features' function.
        img_mean: np.ndarray
            The mean (RGB order) used to pre-/de-process the input/output image/video.
        img_std: np.ndarray
            The std (RGB order) used to pre-/de-process the input/output image/video.

        input_size: np.ndarray
            The shape of the input that the CNN accepts.

        Optional Parameters
        ----------
        feature_weight: float or ndarray
            The weight for each target unit.
            If it is a scalar, the scalar will be used as the universal weight for all units.
            If it is a numpy array, different weights can be specified for different units.
        initial_input: ndarray
            Initial image for the optimization.
            Set to None to use random noise as the initial image.
        iter_n: int
            The total number of iterations.
        lr_start: float
            The learning rate at start of the optimization.
            The learning rate will linearly decrease from lr_start to lr_end during the optimization.
        lr_end: float
            The learning rate at end of the optimization.
            The learning rate will linearly decrease from lr_start to lr_end during the optimization.
        momentum_start: float
            The momentum (gradient descend with momentum) at start of the optimization.
            The momentum will linearly decrease from momentum_start to momentum_end during the optimization.
        momentum_end: float
            The momentum (gradient descend with momentum) at the end of the optimization.
            The momentum will linearly decrease from momentum_start to momentum_end during the optimization.
        decay_start: float
            The decay rate of the image pixels at start of the optimization.
            The decay rate will linearly decrease from decay_start to decay_end during the optimization.
        decay_end: float
            The decay rate of the image pixels at the end of the optimization.
            The decay rate will linearly decrease from decay_start to decay_end during the optimization.
        grad_normalize: bool
            Normalise the gradient or not for each iteration.
        image_jitter: bool
            Use image jittering or not.
            If true, randomly shift the intermediate reconstructed image for each iteration.
        jitter_size: int
            The size of the image jitter, in pixels.
        image_blur: bool
            Use image smoothing or not.
            If true, smoothing the image for each iteration.
        sigma_xy_start, sigma_t_start: float
            The spatial / temporal sizes of the gaussian filter for image smoothing at the start of the optimization.
            Each sigma will linearly decrease from its start value to its end value during the optimization.
        sigma_xy_end, sigma_t_end: float
            The spatial / temporal sizes of the gaussian filter for image smoothing at the end of the optimization.
        use_p_norm_reg: bool
            Use p-norm loss for image or not as regularization term.
        p: float
            The order of the p-norm loss of image
        lamda_start: float
            The weight for p-norm loss at start of the optimization.
            The lamda will linearly decrease from lamda_start to lamda_end during the optimization.
        lamda_end: float
            The weight for p-norm loss at the end of the optimization.
            The lamda will linearly decrease from lamda_start to lamda_end during the optimization.
        use_TV_norm_reg: bool
            Use TV-norm or not as regularization term.
        TVbeta1, TVbeta2: float
            The orders of the spatial / temporal TV-norm.
        TVlamda_start_sp, TVlamda_start_tmp: float
            The weights for the spatial / temporal TV-norm regularization terms at the start of the optimization.
            Each TVlamda will linearly decrease from its start value to its end value during the optimization.
        TVlamda_end_sp, TVlamda_end_tmp: float
            The weights for the spatial / temporal TV-norm regularization terms at the end of the optimization.
        clip_extreme: bool
            Clip or not the pixels with extreme high or low value.
        clip_extreme_every: int
            Clip the pixels with extreme value every n iterations.
        e_pct_start: float
            the percentage of pixels to be clipped at start of the optimization.
            The percentage will linearly decrease from e_pct_start to e_pct_end during the optimization.
        e_pct_end: float
            the percentage of pixels to be clipped at the end of the optimization.
            The percentage will linearly decrease from e_pct_start to e_pct_end during the optimization.
        clip_small_norm: bool
            Clip or not the pixels with small norm of RGB values.
        clip_small_norm_every: int
            Clip the pixels with small norm every n iterations
        n_pct_start: float
            The percentage of pixels to be clipped at start of the optimization.
            The percentage will linearly decrease from n_pct_start to n_pct_end during the optimization.
        n_pct_end: float
            The percentage of pixels to be clipped at the end of the optimization.
            The percentage will linearly decrease from n_pct_start to n_pct_end during the optimization.
        clip_small_contribution: bool
            Clip or not the pixels with small contribution: norm of RGB channels of (img*grad).
        clip_small_contribution_every: int
            Clip the pixels with small contribution every n iterations.
        c_pct_start: float
            The percentage of pixels to be clipped at start of the optimization.
            The percentage will linearly decrease from c_pct_start to c_pct_end during the optimization.
        c_pct_end: float
            The percentage of pixels to be clipped at the end of the optimization.
            The percentage will linearly decrease from c_pct_start to c_pct_end during the optimization.
        disp_every: int
            Display the optimization information for every n iterations.
        save_intermediate: bool
            Save the intermediate reconstruction or not.
        save_intermediate_every: int
            Save the intermediate reconstruction for every n iterations.
        save_intermediate_path: str
            The path to save the intermediate reconstruction.

        Returns
        -------
        img: ndarray
            The preferred image/video, with the same shape as input_size.

     '''

    # make save dir
    if save_intermediate:
        if save_intermediate_path is None:
            save_intermediate_path = os.path.join(
                '.',
                'preferred_gd_' + datetime.now().strftime('%Y%m%dT%H%M%S'))
        if not os.path.exists(save_intermediate_path):
            os.makedirs(save_intermediate_path, exist_ok=True)

    # initial input
    if initial_input is None:
        initial_input = np.random.randint(0, 256, (input_size))
    else:
        input_size = initial_input.shape
    # image mean
    img_mean = img_mean
    img_std = img_std
    # image norm
    noise_vid = np.random.randint(0, 256, (input_size))
    img_norm0 = np.linalg.norm(noise_vid)
    img_norm0 = img_norm0 / 2.

    if save_intermediate:
        if len(input_size) == 3:
            #image
            save_name = 'initial_image.jpg'
            if bgr:
                PIL.Image.fromarray(np.uint8(
                    initial_input[..., [2, 1, 0]])).save(
                        os.path.join(save_intermediate_path, save_name))
            else:
                PIL.Image.fromarray(np.uint8(initial_input)).save(
                    os.path.join(save_intermediate_path, save_name))
        elif len(input_size) == 4:
            # video
            save_name = 'initial_video.avi'
            save_video(initial_input, save_name, save_intermediate_path, bgr)

            save_name = 'initial_video.gif'
            save_gif(initial_input,
                     save_name,
                     save_intermediate_path,
                     bgr,
                     fr_rate=150)

        else:
            print('Input size is not appropriate for save')
            raise ValueError('len(input_size) must be 3 or 4')

    # create feature mask if not define
    if feature_mask is None:
        feature_mask = create_feature_mask(net, exec_code, input_size, channel)

    # iteration for gradient descent
    init_input = initial_input.copy()
    if len(input_size) == 3:
        #Image
        input = img_preprocess(init_input, img_mean, img_std, norm)
    else:
        #Video
        input = vid_preprocess(init_input, img_mean, img_std, norm)
    delta_input = np.zeros_like(input)
    feat_grad = np.zeros_like(feature_mask)
    feat_grad[
        feature_mask ==
        1] = -1.  # here we use gradient descent, so the gradient is negative, in order to make the target units have high positive activation;
    feat_grad = feat_grad * feature_weight

    # Loss function (minus Loss)
    loss_fun = minusLoss()

    for t in range(iter_n):

        # parameters
        lr = lr_start + t * (lr_end - lr_start) / iter_n
        momentum = momentum_start + t * (momentum_end -
                                         momentum_start) / iter_n
        decay = decay_start + t * (decay_end - decay_start) / iter_n
        sigma_xy = sigma_xy_start + t * (sigma_xy_end -
                                         sigma_xy_start) / iter_n
        sigma_t = sigma_t_start + t * (sigma_t_end - sigma_t_start) / iter_n

        # shift
        if image_jitter:
            ox, oy = np.random.randint(-jitter_size, jitter_size + 1, 2)
            oz = np.random.randint(-jitter_size_z, jitter_size_z + 1, 1)
            input = np.roll(np.roll(np.roll(input, ox, -1), oy, -2), oz, -3)
            delta_input = np.roll(
                np.roll(np.roll(delta_input, ox, -1), oy, -2), oz, -3)
        # create Tensor
        input = torch.Tensor(input[np.newaxis])
        input.requires_grad_()
        # forward
        fw = get_cnn_features(net, input, exec_code)[0]

        feat = torch.masked_select(fw, torch.ByteTensor(feature_mask).bool())
        feat_abs_mean = np.mean(np.abs(feat[0].detach().numpy()))

        #for the first time iteration, input.grad is None
        if input.grad is not None:
            input.grad.data.zero_()
        # zero grad
        net.zero_grad()

        # backward for net
        loss = loss_fun(feat)
        loss.backward()

        grad = input.grad.numpy()
        input = input.detach().numpy()

        # normalize gradient
        if grad_normalize:
            grad_mean = np.abs(grad).mean()
            if grad_mean > 0:
                grad = grad / grad_mean

        # gradient with momentum
        delta_input = delta_input * momentum + grad

        # p norm regularization
        if use_p_norm_reg:
            lamda = lamda_start + t * (lamda_end - lamda_start) / iter_n
            _, grad_r = p_norm(input, p)
            grad_r = grad_r / (img_norm0**2)
            if grad_normalize:
                grad_mean = np.abs(grad_r).mean()
                if grad_mean > 0:
                    grad_r = grad_r / grad_mean
            delta_input = delta_input + lamda * grad_r

        # TV norm regularization
        if use_TV_norm_reg:
            TVlamda_sp = TVlamda_start_sp + t * (TVlamda_end_sp -
                                                 TVlamda_start_sp) / iter_n
            if len(input_size) == 3:
                loss_r, grad_r = TV_norm(input, TVbeta1)
                loss_r = loss_r / (img_norm0**2)
                grad_r = grad_r / (img_norm0**2)
                if grad_normalize:
                    grad_mean = np.abs(grad_r).mean()
                    if grad_mean > 0:
                        grad_r = grad_r / grad_mean
                delta_input = delta_input + TVlamda_sp * grad_r

            else:
                # spatial
                loss_r_sp, grad_r_sp = TV_norm_sp(input, TVbeta1)
                loss_r_sp = loss_r_sp / (img_norm0**2)
                grad_r_sp = grad_r_sp / (img_norm0**2)
                if grad_normalize:
                    grad_mean_sp = np.abs(grad_r_sp).mean()
                    if grad_mean_sp > 0:
                        grad_r_sp = grad_r_sp / grad_mean_sp

                # temporal
                TVlamda_tmp = TVlamda_start_tmp + t * (
                    TVlamda_end_tmp - TVlamda_start_tmp) / iter_n
                loss_r_tmp, grad_r_tmp = TV_norm_tmp(input, TVbeta2)
                loss_r_tmp = loss_r_tmp / (img_norm0**2)
                grad_r_tmp = grad_r_tmp / (img_norm0**2)
                if grad_normalize:
                    grad_mean_tmp = np.abs(grad_r_tmp).mean()
                    if grad_mean_tmp > 0:
                        grad_r_tmp = grad_r_tmp / grad_mean_tmp

                delta_input = delta_input + TVlamda_sp * grad_r_sp + TVlamda_tmp * grad_r_tmp

        # input update [0] means remove the newaxis
        input = np.add(input, -lr * delta_input, dtype=np.float32)[0]
        grad = grad[0]
        delta_input = delta_input[0]
        # clip pixels with extreme value
        if clip_extreme and (t + 1) % clip_extreme_every == 0:
            e_pct = e_pct_start + t * (e_pct_end - e_pct_start) / iter_n
            input = clip_extreme_pixel(input, e_pct)

        # clip pixels with small norm
        if clip_small_norm and (t + 1) % clip_small_norm_every == 0:
            n_pct = n_pct_start + t * (n_pct_end - n_pct_start) / iter_n
            input = clip_small_norm_pixel(input, n_pct)

        # clip pixels with small contribution
        if clip_small_contribution and (
                t + 1) % clip_small_contribution_every == 0:
            c_pct = c_pct_start + t * (c_pct_end - c_pct_start) / iter_n
            input = clip_small_contribution_pixel(input, grad, c_pct)

        # unshift
        if image_jitter:
            input = np.roll(np.roll(np.roll(input, -ox, -1), -oy, -2), -oz, -3)
            delta_input = delta_input - grad
            delta_input = np.roll(
                np.roll(np.roll(delta_input, -ox, -1), -oy, -2), -oz, -3)
            delta_input = delta_input + grad

        # L_2 decay
        input = (1 - decay) * input

        # gaussian blur
        if image_blur:
            if len(input_size) == 3:
                input = gaussian_blur(input, sigma_xy)
            else:
                input = gaussian_blur_vid(input, sigma_xy, sigma_t)

        # disp info
        if (t + 1) % disp_every == 0:
            print('iter=%d; mean(abs(feat))=%g;' % (t + 1, feat_abs_mean))

        # save image
        if save_intermediate and ((t + 1) % save_intermediate_every == 0):
            if len(input_size) == 3:
                save_name = '%05d.jpg' % (t + 1)
                if bgr:
                    PIL.Image.fromarray(
                        normalise_img(
                            img_deprocess(input, img_mean, img_std,
                                          norm)[..., [2, 1, 0]])).save(
                                              os.path.join(
                                                  save_intermediate_path,
                                                  save_name))
                else:
                    PIL.Image.fromarray(
                        normalise_img(
                            img_deprocess(input, img_mean, img_std,
                                          norm))).save(
                                              os.path.join(
                                                  save_intermediate_path,
                                                  save_name))

            else:
                save_name = '%05d.avi' % (t + 1)
                save_video(normalise_vid(
                    vid_deprocess(input, img_mean, img_std, norm)),
                           save_name,
                           save_intermediate_path,
                           bgr,
                           fr_rate=10)
                save_name = '%05d.gif' % (t + 1)
                save_gif(normalise_vid(
                    vid_deprocess(input, img_mean, img_std, norm)),
                         save_name,
                         save_intermediate_path,
                         bgr,
                         fr_rate=150)

    # return input
    if len(input_size) == 3:
        return img_deprocess(input, img_mean, img_std, norm)
    else:
        return vid_deprocess(input, img_mean, img_std, norm)
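
All of the *_start / *_end pairs documented above are interpolated in the same way inside the loop (for example, lr = lr_start + t * (lr_end - lr_start) / iter_n). A small helper that captures this schedule, shown only as an illustrative refactoring and not part of the original code:

def linear_schedule(t, iter_n, start, end):
    # Linearly interpolate a parameter from `start` toward `end` over iter_n iterations.
    return start + t * (end - start) / iter_n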
Code Example #8
                cv2.imwrite(
                    "translated_samples/" + os.path.basename(input_item),
                    concat)

            elif ext in [".mp4", ".avi"]:
                cam = cv2.VideoCapture(input_item)
                fps = cam.get(cv2.CAP_PROP_FPS)

                frames = []
                for original_image, translated_image in tqdm.tqdm(
                        demo.run_on_video(cam)):
                    if args.show_original:
                        concat = np.concatenate(
                            [original_image, translated_image], axis=1)
                    else:
                        concat = translated_image
                    height, width = concat.shape[:2]
                    resized = cv2.resize(concat,
                                         None,
                                         fx=0.5,
                                         fy=0.5,
                                         interpolation=cv2.INTER_AREA)
                    frames.append(resized)

                cam.release()

                save_video(
                    frames,
                    "translated_samples/" + os.path.basename(input_item), fps)