Example #1
0
def run(vid_file,
        start_time,
        dur,
        pr,
        gpu,
        buf=0.05,
        mask=None,
        arg=None,
        net=None):
    """Decode a video clip and run the audio-visual separation net on it.

    Uses ffmpeg to extract (1) 256x256 frames for the network input,
    (2) full-resolution frames for visualization, and (3) a stereo WAV of
    the soundtrack; center-crops the small frames to 224x224, optionally
    grays out the left or right half of every frame, RMS-normalizes the
    audio, and calls ``net.predict`` on frames + samples.

    Parameters:
      vid_file:   path to the input video file.
      start_time: decode offset in seconds (ffmpeg ``-ss``).
      dur:        clip duration in seconds; ``buf`` is added as padding.
      pr:         parameter object (fps, samp_sr, num_samples,
                  sampled_frames, input_rms, ...).
      gpu:        unused within this function body.
      buf:        extra seconds appended to ``dur``.
      mask:       None, 'l', or 'r' -- gray out (128) the left or right
                  half of the frames; any other value raises RuntimeError.
      arg:        options object (max_full_height, fullres, cam).
      net:        model wrapper exposing ``init()`` and ``predict()``.

    Returns a dict with predicted fg/bg waveforms and spectrograms, the
    mixture, and the frames chosen for visualization -- or None when the
    decoded audio is shorter than ``pr.num_samples``.

    NOTE(review): the ffmpeg command strings contain %(name)s placeholders
    but no mapping is passed to ut.frm -- presumably ut.frm interpolates
    from the caller's local variables, so locals here must not be renamed.
    TODO confirm against ut.frm's implementation.
    """
    # Debug print of the parameter object (Python 2 print statement).
    print pr
    # Pad the requested duration slightly so the last frames/samples exist.
    dur = dur + buf
    with ut.TmpDir() as vid_path:
        # Optional ffmpeg scale filter capping full-res height at
        # arg.max_full_height ('-2' keeps the width even / aspect intact).
        height_s = '-vf "scale=-2:\'min(%d,ih)\'"' % arg.max_full_height if arg.max_full_height > 0 else ''
        # 256x256 frames at pr.fps -> network input stream.
        ut.sys_check(
            ut.frm(
                'ffmpeg -loglevel error -ss %(start_time)s -i "%(vid_file)s" -safe 0  '
                '-t %(dur)s -r %(pr.fps)s -vf scale=256:256 "%(vid_path)s/small_%%04d.png"'
            ))
        # Full-resolution (optionally height-capped) frames -> visualization.
        ut.sys_check(
            ut.frm(
                'ffmpeg -loglevel error -ss %(start_time)s -i "%(vid_file)s" -safe 0 '
                '-t %(dur)s -r %(pr.fps)s %(height_s)s "%(vid_path)s/full_%%04d.png"'
            ))
        # Stereo audio resampled to pr.samp_sr.
        ut.sys_check(
            ut.frm(
                'ffmpeg -loglevel error -ss %(start_time)s -i "%(vid_file)s" -safe 0  '
                '-t %(dur)s -ar %(pr.samp_sr)s -ac 2 "%(vid_path)s/sound.wav"')
        )

        if arg.fullres:
            # Full-res frames, truncated to the count the net samples
            # (Python 2: map returns a list here).
            fulls = map(
                ig.load,
                sorted(ut.glob(vid_path, 'full_*.png'))[:pr.sampled_frames])
            fulls = np.array(fulls)

        snd = sound.load_sound(pj(vid_path, 'sound.wav'))
        samples_orig = snd.normalized().samples
        samples_orig = samples_orig[:pr.num_samples]
        samples_src = samples_orig.copy()
        # Bail out if the clip yielded less audio than the net requires.
        if samples_src.shape[0] < pr.num_samples:
            return None

        # Small frames: load, center-crop 256x256 -> 224x224, truncate.
        ims = map(ig.load, sorted(ut.glob(vid_path, 'small_*.png')))
        ims = np.array(ims)
        d = 224
        # Integer crop origin (Python 2 integer division).
        y = x = ims.shape[1] / 2 - d / 2
        ims = ims[:, y:y + d, x:x + d]
        ims = ims[:pr.sampled_frames]

        # Optionally blank half of every frame with mid-gray (128) so the
        # net can only attend to the other half.
        if mask == 'l':
            ims[:, :, :ims.shape[2] / 2] = 128
            if arg.fullres:
                fulls[:, :, :fulls.shape[2] / 2] = 128
        elif mask == 'r':
            ims[:, :, ims.shape[2] / 2:] = 128
            if arg.fullres:
                fulls[:, :, fulls.shape[2] / 2:] = 128
        elif mask is None:
            pass
        else:
            raise RuntimeError()

        # Scale the input audio to the RMS level the net expects, run the
        # net on a batch of one, then strip the batch dimension with [0].
        samples_src = mu.normalize_rms_np(samples_src[None], pr.input_rms)[0]
        net.init()
        ret = net.predict(ims[None], samples_src[None])
        # [:, None] adds a trailing channel axis to the waveforms.
        samples_pred_fg = ret['samples_pred_fg'][0][:, None]
        samples_pred_bg = ret['samples_pred_bg'][0][:, None]
        spec_pred_fg = ret['spec_pred_fg'][0]
        spec_pred_bg = ret['spec_pred_bg'][0]
        # Debug print of the predicted background spectrogram shape.
        print spec_pred_bg.shape
        spec_mix = ret['spec_mix'][0]

        if arg.cam:
            # NOTE(review): `fulls` is only assigned when arg.fullres is
            # set; arg.cam without arg.fullres raises NameError here --
            # verify callers always set both flags together.
            cam, vis = find_cam(fulls, samples_orig, arg)
        else:
            if arg.fullres:
                vis = fulls
            else:
                vis = ims

        return dict(ims=vis,
                    samples_pred_fg=samples_pred_fg,
                    samples_pred_bg=samples_pred_bg,
                    samples_mix=ret['samples_mix'][0],
                    samples_src=samples_src,
                    spec_pred_fg=spec_pred_fg,
                    spec_pred_bg=spec_pred_bg,
                    spec_mix=spec_mix)
# This example computes a class activation map (CAM) for an input
# video, then saves a visualization of the CAM.

# Parameters and pretrained weights for the "shift" model.
pr = shift_params.shift_v1()
model_file = '../results/nets/shift/net.tf-650000'
gpu = None

# uncomment for higher-resolution CAM (like the ones in the paper)
# pr = shift_params.cam_v1()
# model_file = '../results/nets/cam/net.tf-675000'

# Extract frames and the soundtrack from the example video; the extra
# 2/30 s of duration gives slack for the final sampled frame.
with ut.VidFrames('../data/crossfire.mp4', sound = True,
                  start_time = 0., end_time = pr.vid_dur + 2./30,
                  fps = 29.97) as (im_files, snd_file):
    # Read the extracted frames and keep only the ones the net samples.
    ims = np.array([ig.load(f) for f in im_files])[:pr.sampled_frames]
    # Load and normalize the audio, trimmed to the net's sample count.
    snd = sound.load_sound(snd_file).normalized()
    samples = snd.samples[:pr.num_samples]
    # make a version of the net using the pretrained weights
    # (i.e. learned through self-supervision)
    clf = shift_net.NetClf(pr, model_file, gpu=gpu)
    # use the audio-visual net to compute a class activation map
    [cam] = clf.predict_cam_resize(ims[None], samples[None])
    # Drop batch and channel axes, take magnitudes, average over time.
    cam = np.mean(np.abs(cam[0, :, :, :, 0]), axis=0)
    # Overlay the time-averaged CAM on the middle frame and save it.
    middle_frame = ims[len(ims) // 2]
    vis = sep_video.heatmap(middle_frame[None],
                            cam[None],
                            adapt=True)
    ig.save('../results/cam_example.png', vis[0])