def run(vid_file, start_time, dur, pr, gpu, buf=0.05, mask=None, arg=None, net=None):
  print(pr)
  dur = dur + buf
  with ut.TmpDir() as vid_path:
    # cap the height of the full-resolution frames (<= 0 disables scaling)
    height_s = ('-vf "scale=-2:\'min(%d,ih)\'"' % arg.max_full_height
                if arg.max_full_height > 0 else '')
    # extract 256x256 frames, full-resolution frames, and stereo audio
    ut.sys_check(ut.frm(
        'ffmpeg -loglevel error -ss %(start_time)s -i "%(vid_file)s" -safe 0 '
        '-t %(dur)s -r %(pr.fps)s -vf scale=256:256 "%(vid_path)s/small_%%04d.png"'))
    ut.sys_check(ut.frm(
        'ffmpeg -loglevel error -ss %(start_time)s -i "%(vid_file)s" -safe 0 '
        '-t %(dur)s -r %(pr.fps)s %(height_s)s "%(vid_path)s/full_%%04d.png"'))
    ut.sys_check(ut.frm(
        'ffmpeg -loglevel error -ss %(start_time)s -i "%(vid_file)s" -safe 0 '
        '-t %(dur)s -ar %(pr.samp_sr)s -ac 2 "%(vid_path)s/sound.wav"'))

    if arg.fullres:
      fulls = list(map(ig.load, sorted(ut.glob(vid_path, 'full_*.png'))[:pr.sampled_frames]))
      fulls = np.array(fulls)

    snd = sound.load_sound(pj(vid_path, 'sound.wav'))
    samples_orig = snd.normalized().samples
    samples_orig = samples_orig[:pr.num_samples]
    samples_src = samples_orig.copy()
    if samples_src.shape[0] < pr.num_samples:
      # the clip is too short to fill the network's audio input
      return None

    # center-crop the 256x256 frames to the network's 224x224 input size
    ims = np.array(list(map(ig.load, sorted(ut.glob(vid_path, 'small_*.png')))))
    d = 224
    y = x = ims.shape[1] // 2 - d // 2
    ims = ims[:, y:y + d, x:x + d]
    ims = ims[:pr.sampled_frames]

    # optionally gray out one half of each frame
    if mask == 'l':
      ims[:, :, :ims.shape[2] // 2] = 128
      if arg.fullres:
        fulls[:, :, :fulls.shape[2] // 2] = 128
    elif mask == 'r':
      ims[:, :, ims.shape[2] // 2:] = 128
      if arg.fullres:
        fulls[:, :, fulls.shape[2] // 2:] = 128
    elif mask is None:
      pass
    else:
      raise RuntimeError('unrecognized mask: %s' % mask)

    samples_src = mu.normalize_rms_np(samples_src[None], pr.input_rms)[0]

    net.init()
    ret = net.predict(ims[None], samples_src[None])
    samples_pred_fg = ret['samples_pred_fg'][0][:, None]
    samples_pred_bg = ret['samples_pred_bg'][0][:, None]
    spec_pred_fg = ret['spec_pred_fg'][0]
    spec_pred_bg = ret['spec_pred_bg'][0]
    print(spec_pred_bg.shape)
    spec_mix = ret['spec_mix'][0]

    if arg.cam:
      # find_cam() uses the full-resolution frames, so this requires arg.fullres
      cam, vis = find_cam(fulls, samples_orig, arg)
    elif arg.fullres:
      vis = fulls
    else:
      vis = ims

    return dict(ims=vis,
                samples_pred_fg=samples_pred_fg,
                samples_pred_bg=samples_pred_bg,
                samples_mix=ret['samples_mix'][0],
                samples_src=samples_src,
                spec_pred_fg=spec_pred_fg,
                spec_pred_bg=spec_pred_bg,
                spec_mix=spec_mix)
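# --- Hedged usage sketch (an illustration, not part of the original source). ---
# Shows how run() might be invoked. The parameter constructor, NetClf class,
# checkpoint path, and input video below are hypothetical; only the arg fields
# (max_full_height, fullres, cam) are grounded in how run() reads them above.
def example_separation():
  import argparse
  pr = sep_params.full()  # hypothetical: a params object providing fps, samp_sr, etc.
  net = NetClf(pr, '../results/nets/sep/net.tf-160000', gpu=0)  # hypothetical checkpoint
  arg = argparse.Namespace(
      max_full_height=600,  # cap full-resolution frame height
      fullres=True,         # decode full-resolution frames for visualization
      cam=False)            # skip the class activation map
  out = run('../data/example.mp4', start_time=5., dur=pr.vid_dur,
            pr=pr, gpu=0, mask=None, arg=arg, net=net)
  if out is None:
    print('clip too short to fill pr.num_samples audio samples')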
# Computes a class activation map (CAM) for an input video, then saves a
# visualization of the CAM.
pr = shift_params.shift_v1()
model_file = '../results/nets/shift/net.tf-650000'
gpu = None
# uncomment for higher-resolution CAM (like the ones in the paper)
# pr = shift_params.cam_v1()
# model_file = '../results/nets/cam/net.tf-675000'
with ut.VidFrames('../data/crossfire.mp4', sound=True, start_time=0.,
                  end_time=pr.vid_dur + 2./30, fps=29.97) as (im_files, snd_file):
  ims = np.array(list(map(ig.load, im_files)))
  ims = ims[:pr.sampled_frames]
  snd = sound.load_sound(snd_file).normalized()
  samples = snd.samples[:pr.num_samples]

  # make a version of the net using the pretrained weights
  # (i.e. learned through self-supervision)
  clf = shift_net.NetClf(pr, model_file, gpu=gpu)

  # use the audio-visual net to compute a class activation map
  [cam] = clf.predict_cam_resize(ims[np.newaxis], samples[np.newaxis])

  # average the CAM over time and overlay it on the middle frame
  cam = np.abs(cam[0, :, :, :, 0])
  cam = np.mean(cam, 0)
  vis = sep_video.heatmap(ims[len(ims) // 2][np.newaxis], cam[np.newaxis], adapt=True)
  ig.save('../results/cam_example.png', vis[0])
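# --- Hedged sketch (an assumption, not the repo's implementation). ---
# A minimal stand-in for sep_video.heatmap that illustrates the idea behind
# the overlay: normalize the CAM to [0, 1], upsample it to the frame size,
# map it through a colormap, and alpha-blend it over the frame.
import numpy as np
import scipy.ndimage
from matplotlib import cm

def overlay_cam(frame, cam, alpha=0.5):
  # frame: (H, W, 3) uint8 image; cam: (h, w) float activation map
  cam = cam.astype('float32')
  cam = (cam - cam.min()) / max(1e-8, cam.max() - cam.min())
  zoom = (1.0 * frame.shape[0] / cam.shape[0],
          1.0 * frame.shape[1] / cam.shape[1])
  cam = scipy.ndimage.zoom(cam, zoom, order=1)  # bilinear upsample to frame size
  heat = cm.jet(cam)[:, :, :3] * 255.           # RGBA in [0, 1] -> RGB in [0, 255]
  return np.uint8((1 - alpha) * frame + alpha * heat)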