def frame_extraction(video_path, short_side):
    """Extract frames given video_path.

    Args:
        video_path (str): Path of the input video.
        short_side (int): Target length of the short side after rescaling.
    """
    # Load the video and extract frames into ./tmp/video_name
    target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
    os.makedirs(target_dir, exist_ok=True)
    # Should be able to handle videos up to several hours
    frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
    vid = cv2.VideoCapture(video_path)
    frames = []
    frame_paths = []
    flag, frame = vid.read()
    cnt = 0
    new_h, new_w = None, None
    while flag:
        if new_h is None:
            h, w, _ = frame.shape
            new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf))
        frame = mmcv.imresize(frame, (new_w, new_h))
        frames.append(frame)
        frame_path = frame_tmpl.format(cnt + 1)
        frame_paths.append(frame_path)
        cv2.imwrite(frame_path, frame)
        cnt += 1
        flag, frame = vid.read()
    return frame_paths, frames
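A minimal usage sketch, assuming a readable local file demo.mp4 (hypothetical path). The helper writes JPEGs under ./tmp/demo and returns both the on-disk paths and the in-memory frames, already rescaled so the short side equals short_side:

frame_paths, frames = frame_extraction('demo.mp4', short_side=256)
# For a 1920x1080 input, mmcv.rescale_size((1920, 1080), (256, np.Inf))
# gives (455, 256), so frames[0].shape == (256, 455, 3).
print(len(frame_paths), frames[0].shape)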
def rescale(self, scale, interpolation=None):
    """See :func:`BaseInstanceMasks.rescale`."""
    new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
    if len(self.masks) == 0:
        rescaled_masks = PolygonMasks([], new_h, new_w)
    else:
        rescaled_masks = self.resize((new_h, new_w))
    return rescaled_masks
def rescale(self, scale, interpolation='nearest'):
    """See :func:`BaseInstanceMasks.rescale`."""
    if len(self.masks) == 0:
        new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
        rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8)
    else:
        rescaled_masks = np.stack([
            mmcv.imrescale(mask, scale, interpolation=interpolation)
            for mask in self.masks
        ])
    height, width = rescaled_masks.shape[1:]
    return BitmapMasks(rescaled_masks, height, width)
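Both mask variants above delegate the size arithmetic to mmcv.rescale_size, which takes a float factor or a (max_long_edge, max_short_edge) tuple. A short sanity check of those semantics (the expected values match the unit test later on this page):

import mmcv

# Float scale: both edges are multiplied by the factor.
assert mmcv.rescale_size((400, 300), 1.5) == (600, 450)
# Tuple scale: rescale as large as possible while keeping the long edge
# within max(scale) and the short edge within min(scale).
assert mmcv.rescale_size((400, 300), (1000, 600)) == (800, 600)
assert mmcv.rescale_size((400, 300), (180, 200)) == (200, 150)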
def _resize_frames(frame_list,
                   scale,
                   keep_ratio=True,
                   interpolation='bilinear'):
    """Resize frames according to given scale.

    Codes are modified from `mmaction2/datasets/pipelines/augmentation.py`,
    `Resize` class.

    Args:
        frame_list (list[np.ndarray]): Frames to be resized.
        scale (tuple[int]): If keep_ratio is True, it serves as scaling
            factor or maximum size: the image will be rescaled as large
            as possible within the scale. Otherwise, it serves as (w, h)
            of output size.
        keep_ratio (bool): If set to True, images will be resized without
            changing the aspect ratio. Otherwise, it will resize images to
            a given size. Default: True.
        interpolation (str): Algorithm used for interpolation:
            "nearest" | "bilinear". Default: "bilinear".

    Returns:
        list[np.ndarray]: The resized frames.
    """
    if scale is None or (scale[0] == -1 and scale[1] == -1):
        return frame_list
    scale = tuple(scale)
    max_long_edge = max(scale)
    max_short_edge = min(scale)
    if max_short_edge == -1:
        scale = (np.inf, max_long_edge)

    img_h, img_w, _ = frame_list[0].shape

    if keep_ratio:
        new_w, new_h = mmcv.rescale_size((img_w, img_h), scale)
    else:
        new_w, new_h = scale

    frame_list = [
        mmcv.imresize(img, (new_w, new_h), interpolation=interpolation)
        for img in frame_list
    ]

    return frame_list
def _resize_img(self, results):
    for key in results.get('img_fields', ['img']):
        h, w = results[key].shape[:2]
        # Jitter the aspect ratio within +/- self.jitter
        dw = w * self.jitter
        dh = h * self.jitter
        new_ar = (w + np.random.uniform(-dw, dw)) / (
            h + np.random.uniform(-dh, dh))
        w = h * new_ar

        if self.keep_ratio:
            scale = mmcv.rescale_size((w, h), results['scale'])
        else:
            scale = results['scale']

        img, w_scale, h_scale = mmcv.imresize(
            results[key], scale, return_scale=True)
        scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                dtype=np.float32)
        results[key] = img
        results['img_shape'] = img.shape
        results['pad_shape'] = img.shape  # in case that there is no padding
        results['scale_factor'] = scale_factor
        results['keep_ratio'] = self.keep_ratio
def __call__(self, results):
    w, h = results['img_info']['width'], results['img_info']['height']
    if self.keep_ratio:
        new_w, new_h = rescale_size((w, h), self.img_scale,
                                    return_scale=False)
    else:
        new_w, new_h = self.img_scale
    w_scale = new_w / w
    h_scale = new_h / h

    scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                            dtype=np.float32)
    results['img_shape'] = (new_h, new_w, 1)
    results['scale_factor'] = scale_factor
    results['keep_ratio'] = self.keep_ratio
    return results
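The [w_scale, h_scale, w_scale, h_scale] layout mirrors the (x1, y1, x2, y2) bbox format used downstream, so boxes can be rescaled with one broadcast multiply. A small illustration with a made-up box:

import numpy as np

scale_factor = np.array([0.5, 0.5, 0.5, 0.5], dtype=np.float32)
bboxes = np.array([[40., 60., 200., 300.]])  # hypothetical (x1, y1, x2, y2)
print(bboxes * scale_factor)  # [[ 20.  30. 100. 150.]]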
def __call__(self, results):
    """Performs the ResizeWithBox augmentation.

    Args:
        results (dict): The resulting dict to be modified and passed
            to the next transform in pipeline.
    """
    if 'scale_factor' not in results:
        results['scale_factor'] = np.array([1, 1], dtype=np.float32)
    img_h, img_w = results['img_shape']

    if self.keep_ratio:
        new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale)
    else:
        new_w, new_h = self.scale

    self.scale_factor = np.array([new_w / img_w, new_h / img_h],
                                 dtype=np.float32)

    results['img_shape'] = (new_h, new_w)
    results['keep_ratio'] = self.keep_ratio
    results['scale_factor'] = results['scale_factor'] * self.scale_factor

    if not self.lazy:
        results['imgs'] = [
            mmcv.imresize(
                img, (new_w, new_h), interpolation=self.interpolation)
            for img in results['imgs']
        ]
        for idx in range(len(results['detections'])):
            cur_detections = results['detections'][idx]
            cur_detections[:, 0::2] = np.clip(
                cur_detections[:, 0::2] * self.scale_factor[0], 0, new_w)
            cur_detections[:, 1::2] = np.clip(
                cur_detections[:, 1::2] * self.scale_factor[1], 0, new_h)
            results['detections'][idx] = cur_detections
    else:
        raise NotImplementedError

    return results
def rescale(self, scale, interpolation='nearest'):
    """Rescale masks as large as possible while keeping the aspect ratio.
    For details, refer to :func:`mmcv.imrescale`.

    Args:
        scale (tuple[int]): The maximum size (h, w) of the rescaled mask.
        interpolation (str): Same as :func:`mmcv.imrescale`.

    Returns:
        BitmapMasks: The rescaled masks.
    """
    if len(self.masks) == 0:
        new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
        rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8)
    else:
        rescaled_masks = np.stack([
            mmcv.imrescale(mask, scale, interpolation=interpolation)
            for mask in self.masks
        ])
    height, width = rescaled_masks.shape[1:]
    return BitmapMasks(rescaled_masks, height, width)
def __call__(self, results):
    """Performs the Resize augmentation.

    Args:
        results (dict): The resulting dict to be modified and passed
            to the next transform in pipeline.
    """
    _init_lazy_if_proper(results, self.lazy)

    if 'scale_factor' not in results:
        results['scale_factor'] = np.array([1, 1], dtype=np.float32)
    img_h, img_w = results['img_shape']

    if self.keep_ratio:
        new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale)
    else:
        new_w, new_h = self.scale

    self.scale_factor = np.array([new_w / img_w, new_h / img_h],
                                 dtype=np.float32)

    results['img_shape'] = (new_h, new_w)
    results['keep_ratio'] = self.keep_ratio
    results['scale_factor'] = results['scale_factor'] * self.scale_factor

    if not self.lazy:
        results['imgs'] = [
            mmcv.imresize(
                img, (new_w, new_h), interpolation=self.interpolation)
            for img in results['imgs']
        ]
    else:
        lazyop = results['lazy']
        if lazyop['flip']:
            raise NotImplementedError('Put Flip at last for now')
        lazyop['interpolation'] = self.interpolation

    return results
def __call__(self, frames, proposals):
    frame_w, frame_h = frames[0].shape[1], frames[0].shape[0]
    new_w, new_h = mmcv.rescale_size((frame_w, frame_h), (256, np.Inf))
    w_ratio, h_ratio = new_w / frame_w, new_h / frame_h
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in frames]
    _ = [mmcv.imnormalize_(frame, **self.img_norm_cfg) for frame in frames]
    # THWC -> CTHW -> 1CTHW
    input_array = np.stack(frames).transpose((3, 0, 1, 2))[np.newaxis]
    input_tensor = torch.from_numpy(input_array).to(self.device)

    # Take the proposals of the center frame and rescale them to the
    # resized frame coordinates
    proposal = proposals[len(proposals) // 2]
    proposal = torch.from_numpy(proposal[:, :4]).to(self.device)
    if proposal.shape[0] == 0:
        return None
    proposal[:, 0:4:2] *= w_ratio
    proposal[:, 1:4:2] *= h_ratio

    with torch.no_grad():
        result = self.model(
            return_loss=False,
            img=[input_tensor],
            img_metas=[[dict(img_shape=(new_h, new_w))]],
            proposals=[[proposal]])
    return self.post_proce(result, proposal)
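A quick shape walk-through of the THWC -> CTHW -> 1CTHW conversion used above, on dummy frames (the sizes are arbitrary):

import numpy as np

frames = [np.zeros((256, 455, 3), dtype=np.float32) for _ in range(8)]
stacked = np.stack(frames)              # (T, H, W, C) = (8, 256, 455, 3)
clip = stacked.transpose((3, 0, 1, 2))  # (C, T, H, W) = (3, 8, 256, 455)
batch = clip[np.newaxis]                # (1, C, T, H, W)
print(batch.shape)  # (1, 3, 8, 256, 455)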
def test_rescale_size(self):
    new_size, scale_factor = mmcv.rescale_size((400, 300), 1.5, True)
    assert new_size == (600, 450) and scale_factor == 1.5
    new_size, scale_factor = mmcv.rescale_size((400, 300), 0.934, True)
    assert new_size == (374, 280) and scale_factor == 0.934

    new_size = mmcv.rescale_size((400, 300), 1.5)
    assert new_size == (600, 450)
    new_size = mmcv.rescale_size((400, 300), 0.934)
    assert new_size == (374, 280)

    new_size, scale_factor = mmcv.rescale_size((400, 300), (1000, 600), True)
    assert new_size == (800, 600) and scale_factor == 2.0
    new_size, scale_factor = mmcv.rescale_size((400, 300), (180, 200), True)
    assert new_size == (200, 150) and scale_factor == 0.5

    new_size = mmcv.rescale_size((400, 300), (1000, 600))
    assert new_size == (800, 600)
    new_size = mmcv.rescale_size((400, 300), (180, 200))
    assert new_size == (200, 150)

    with pytest.raises(ValueError):
        mmcv.rescale_size((400, 300), -0.5)

    with pytest.raises(TypeError):
        mmcv.rescale_size((400, 300), [100, 100])
def main():
    args = parse_args()

    # frame_paths, original_frames = frame_extraction(args.video)
    video_pathes = os.listdir(args.video)

    # Single folder:
    # video_path = args.video
    # frame_paths = sorted([osp.join(video_path, x)
    #                       for x in os.listdir(video_path)])

    for video_base_path in video_pathes:
        video_path = osp.join(args.video, video_base_path)
        frame_paths = sorted(
            [osp.join(video_path, x) for x in os.listdir(video_path)])
        # original_frames = [cv2.imread(osp.join(video_path, x))
        #                    for x in os.listdir(video_path)]
        # num_frame = len(frame_paths)
        frame = cv2.imread(frame_paths[0])
        h, w, _ = frame.shape

        # Load label_map
        # label_map = load_label_map(args.label_map)

        # Resize frames so the short side becomes 1800
        new_w, new_h = mmcv.rescale_size((w, h), (1800, np.Inf))
        # frames = [mmcv.imresize(img, (new_w, new_h))
        #           for img in original_frames]
        w_ratio, h_ratio = new_w / w, new_h / h

        human_detections = detection_inference(args, frame_paths)
        for i in range(len(human_detections)):
            det = human_detections[i]
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

        results_total = []
        for human_detection in human_detections:
            # Normalize boxes to [0, 1] in resized-frame coordinates
            human_detection[:, 0::2] /= new_w
            human_detection[:, 1::2] /= new_h
            results = []
            for prop in human_detection:
                results.append(prop.data.cpu().numpy())
            results_total.append(results)

        # Write Pascal VOC-style XML annotations
        target_dir = osp.join('./tmp',
                              osp.basename(osp.splitext(video_path)[0]))
        os.makedirs(target_dir, exist_ok=True)
        for frame_path, anno in zip(frame_paths, results_total):
            output_name = os.path.join(target_dir,
                                       os.path.basename(frame_path))
            # create_tree is assumed to populate the module-level
            # `annotation` element used below
            create_tree(frame_path)
            scale_ratio = np.array([w, h, w, h])
            if anno is None:
                continue
            for ann in anno:
                box = (ann * scale_ratio).astype(np.int64)
                label = 'person'
                left, top, right, bottom = box.astype(float)
                create_object(annotation, label, left, top, right, bottom)

            tree = ET.ElementTree(annotation)
            root = tree.getroot()  # get the root element (an Element)
            pretty_xml(root, '\t', '\n')  # apply the pretty-print helper
            tree.write('%s.xml' % osp.splitext(output_name)[0],
                       encoding='utf-8')
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)
    val_pipeline = config.data.val.pipeline

    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Load label_map
    label_map = load_label_map(args.label_map)
    try:
        if config['data']['train']['custom_classes'] is not None:
            label_map = {
                id + 1: label_map[cls]
                for id, cls in enumerate(config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        config['model']['test_cfg']['rcnn']['action_thr'] = .0
    except KeyError:
        pass

    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    assert len(timestamps) == len(human_detections)
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp, proposal in zip(timestamps, human_detections):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)
        prog_bar.update()

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(int)  # np.int is removed in newer NumPy

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)
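dense_timestamps spreads n output indices evenly around each prediction timestamp; a worked example with made-up values:

import numpy as np

timestamps = np.array([32, 72])  # hypothetical centers, predict_stepsize=40
n = 4
interval = timestamps[1] - timestamps[0]            # 40
start = timestamps[0] - interval / n * (n - 1) / 2  # 32 - 15.0 = 17.0
inds = np.arange(len(timestamps) * n) * interval / n + start
print(inds.astype(int))  # [17 27 37 47 57 67 77 87]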
def __init__(self,
             config,
             display_height=0,
             display_width=0,
             input_video=0,
             predict_stepsize=40,
             output_fps=25,
             clip_vis_length=8,
             out_filename=None,
             show=True,
             stdet_input_shortside=256):
    # stdet sampling strategy
    val_pipeline = config['val_pipeline']
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler[
        'frame_interval']
    self.window_size = clip_len * frame_interval

    # asserts
    assert (out_filename or show), \
        'out_filename and show cannot both be None'
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    assert clip_vis_length <= predict_stepsize
    assert 0 < predict_stepsize <= self.window_size

    # source params
    try:
        self.cap = cv2.VideoCapture(int(input_video))
        self.webcam = True
    except ValueError:
        self.cap = cv2.VideoCapture(input_video)
        self.webcam = False
    assert self.cap.isOpened()

    # stdet input preprocessing params
    h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    self.stdet_input_size = mmcv.rescale_size(
        (w, h), (stdet_input_shortside, np.Inf))
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
    self.img_norm_cfg = img_norm_cfg

    # task init params
    self.clip_vis_length = clip_vis_length
    self.predict_stepsize = predict_stepsize
    self.buffer_size = self.window_size - self.predict_stepsize
    frame_start = self.window_size // 2 - (clip_len // 2) * frame_interval
    self.frames_inds = [
        frame_start + frame_interval * i for i in range(clip_len)
    ]
    self.buffer = []
    self.processed_buffer = []

    # output/display params
    if display_height > 0 and display_width > 0:
        self.display_size = (display_width, display_height)
    elif display_height > 0 or display_width > 0:
        self.display_size = mmcv.rescale_size(
            (w, h), (np.Inf, max(display_height, display_width)))
    else:
        self.display_size = (w, h)
    self.ratio = tuple(
        n / o for n, o in zip(self.stdet_input_size, self.display_size))
    if output_fps <= 0:
        self.output_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
    else:
        self.output_fps = output_fps
    self.show = show
    self.video_writer = None
    if out_filename is not None:
        self.video_writer = self.get_output_video_writer(out_filename)
    display_start_idx = self.window_size // 2 - self.predict_stepsize // 2
    self.display_inds = [
        display_start_idx + i for i in range(self.predict_stepsize)
    ]

    # display multi-threading params
    self.display_id = -1  # task.id for display queue
    self.display_queue = {}
    self.display_lock = threading.Lock()
    self.output_lock = threading.Lock()

    # read multi-threading params
    self.read_id = -1  # task.id for read queue
    self.read_id_lock = threading.Lock()
    self.read_queue = queue.Queue()
    self.read_lock = threading.Lock()
    self.not_end = True  # cap.read() flag

    # program state
    self.stopped = False

    atexit.register(self.clean)
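Note the mirrored tuple (np.Inf, max(display_height, display_width)) in the display-size branch: with an infinite long-edge cap, mmcv.rescale_size pins the short edge to the given value. A quick check with assumed capture dimensions:

import mmcv
import numpy as np

# Hypothetical 1080p capture with only display_width=960 supplied:
print(mmcv.rescale_size((1920, 1080), (np.inf, 960)))  # (1707, 960)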
def main():
    args = parse_args()

    # frame_paths, original_frames = frame_extraction(args.video)

    # folder path
    video_path = args.video
    frame_paths = sorted(
        [osp.join(video_path, x) for x in os.listdir(video_path)])
    num_frame = len(frame_paths)
    # h, w, _ = original_frames[0].shape
    frame = cv2.imread(frame_paths[0])
    h, w, _ = frame.shape

    # Load label_map
    label_map = load_label_map(args.label_map)

    # Resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    # frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    val_pipeline = config['val_pipeline']
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    # if num_frame < clip_len * frame_interval:
    #     frame_interval = max(int(num_frame / clip_len) - 1, 0)
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    for timestamp, proposal in tqdm(zip(timestamps, human_detections)):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [
            mmcv.imresize(cv2.imread(frame_paths[ind]),
                          (new_w, new_h)).astype(np.float32)
            for ind in frame_inds
        ]
        # imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
"""Make it nx frames.""" old_frame_interval = (timestamps[1] - timestamps[0]) start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 new_frame_inds = np.arange( len(timestamps) * n) * old_frame_interval / n + start return new_frame_inds.astype(np.int) dense_n = int(args.predict_stepsize / args.output_stepsize) frames = [ cv2.imread(frame_paths[i - 1]) for i in dense_timestamps(timestamps, dense_n) ] print('Performing visualization') vis_frames = visualize(frames, results) vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=args.output_fps) vid.write_videofile(args.out_filename) #save image target_dir = osp.join('./tmp/test') os.makedirs(target_dir, exist_ok=True) frame_tmpl = osp.join(target_dir, 'img_%06d.jpg') vid.write_images_sequence(frame_tmpl,fps=args.output_fps)
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Get Human detection results and pose results
    human_detections = detection_inference(args, frame_paths)
    pose_results = None
    if args.use_skeleton_recog or args.use_skeleton_stdet:
        pose_results = pose_inference(args, frame_paths, human_detections)

    # Resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Load spatio-temporal detection label_map
    stdet_label_map = load_label_map(args.label_map_stdet)
    rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
    rgb_stdet_config.merge_from_dict(args.cfg_options)
    try:
        if rgb_stdet_config['data']['train']['custom_classes'] is not None:
            stdet_label_map = {
                id + 1: stdet_label_map[cls]
                for id, cls in enumerate(rgb_stdet_config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    action_result = None
    if args.use_skeleton_recog:
        print('Use skeleton-based recognition')
        action_result = skeleton_based_action_recognition(
            args, pose_results, num_frame, h, w)
    else:
        print('Use rgb-based recognition')
        action_result = rgb_based_action_recognition(args)

    stdet_preds = None
    if args.use_skeleton_stdet:
        print('Use skeleton-based SpatioTemporal Action Detection')
        clip_len, frame_interval = 30, 1
        timestamps, stdet_preds = skeleton_based_stdet(
            args, stdet_label_map, human_detections, pose_results, num_frame,
            clip_len, frame_interval, h, w)
        for i in range(len(human_detections)):
            det = human_detections[i]
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
    else:
        print('Use rgb-based SpatioTemporal Action Detection')
        for i in range(len(human_detections)):
            det = human_detections[i]
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
        timestamps, stdet_preds = rgb_based_stdet(args, frames,
                                                  stdet_label_map,
                                                  human_detections, w, h,
                                                  new_w, new_h, w_ratio,
                                                  h_ratio)

    stdet_results = []
    for timestamp, prediction in zip(timestamps, stdet_preds):
        human_detection = human_detections[timestamp - 1]
        stdet_results.append(
            pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(int)  # np.int is removed in newer NumPy

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    output_timestamps = dense_timestamps(timestamps, dense_n)
    frames = [
        cv2.imread(frame_paths[timestamp - 1])
        for timestamp in output_timestamps
    ]

    print('Performing visualization')
    pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
                                 args.device)

    if args.use_skeleton_recog or args.use_skeleton_stdet:
        pose_results = [
            pose_results[timestamp - 1] for timestamp in output_timestamps
        ]

    vis_frames = visualize(frames, stdet_results, pose_results, action_result,
                           pose_model)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)