def test_imnormalize_(self):
    img_for_normalize = np.float32(self.img)
    rgb_img_for_normalize = np.float32(self.img[:, :, ::-1])
    baseline = (rgb_img_for_normalize - self.mean) / self.std
    img = mmcv.imnormalize_(img_for_normalize, self.mean, self.std)
    assert np.allclose(img_for_normalize, baseline)
    assert id(img) == id(img_for_normalize)
    img = mmcv.imnormalize_(
        rgb_img_for_normalize, self.mean, self.std, to_rgb=False)
    assert np.allclose(img, baseline)
    assert id(img) == id(rgb_img_for_normalize)
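The test above pins down the two key properties of `mmcv.imnormalize_`: it normalizes in place and returns the very array it was given, and `to_rgb` controls whether a BGR-to-RGB flip happens before normalization. A minimal standalone sketch of the same behavior (the mean/std values are just illustrative ImageNet-style numbers, not from the source):

import mmcv
import numpy as np

# imnormalize_ requires a float image; uint8 inputs are rejected
img = np.random.randint(0, 256, (4, 4, 3)).astype(np.float32)
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)  # example values
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)     # example values

out = mmcv.imnormalize_(img, mean, std)  # BGR -> RGB, then (img - mean) / std, in place
assert out is img  # the same buffer is returned; no copy is made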
def __call__(self, results):
    modality = results['modality']

    if modality == 'RGB':
        n = len(results['imgs'])
        h, w, c = results['imgs'][0].shape
        imgs = np.empty((n, h, w, c), dtype=np.float32)
        for i, img in enumerate(results['imgs']):
            imgs[i] = img

        for img in imgs:
            mmcv.imnormalize_(img, self.mean, self.std, self.to_bgr)

        results['imgs'] = imgs
        results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_bgr=self.to_bgr)
        return results
    elif modality == 'Flow':
        num_imgs = len(results['imgs'])
        assert num_imgs % 2 == 0
        assert self.mean.shape[0] == 2
        assert self.std.shape[0] == 2
        n = num_imgs // 2
        h, w = results['imgs'][0].shape
        x_flow = np.empty((n, h, w), dtype=np.float32)
        y_flow = np.empty((n, h, w), dtype=np.float32)
        for i in range(n):
            x_flow[i] = results['imgs'][2 * i]
            y_flow[i] = results['imgs'][2 * i + 1]
        x_flow = (x_flow - self.mean[0]) / self.std[0]
        y_flow = (y_flow - self.mean[1]) / self.std[1]
        if self.adjust_magnitude:
            x_flow = x_flow * results['scale_factor'][0]
            y_flow = y_flow * results['scale_factor'][1]
        imgs = np.stack([x_flow, y_flow], axis=-1)
        results['imgs'] = imgs
        args = dict(
            mean=self.mean,
            std=self.std,
            to_bgr=self.to_bgr,
            adjust_magnitude=self.adjust_magnitude)
        results['img_norm_cfg'] = args
        return results
    else:
        raise NotImplementedError
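In the 'RGB' branch above, the frames are first copied into one preallocated float32 batch, then each slice of that batch is normalized in place; iterating `for img in imgs` yields views, so `imnormalize_` mutates the batch directly. A sketch of the same pattern outside the transform class, with assumed mean/std values:

import mmcv
import numpy as np

mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)  # assumed values
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

frames = [
    np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8) for _ in range(4)
]
imgs = np.empty((4, 8, 8, 3), dtype=np.float32)  # batch buffer, N x H x W x C
for i, f in enumerate(frames):
    imgs[i] = f  # implicit uint8 -> float32 copy
for img in imgs:  # each `img` is a view into `imgs`
    mmcv.imnormalize_(img, mean, std, to_rgb=False)  # mutates the batch in place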
def __call__(self, frames, proposals):
    frame_w, frame_h = frames[0].shape[1], frames[0].shape[0]
    new_w, new_h = mmcv.rescale_size((frame_w, frame_h), (256, np.inf))
    w_ratio, h_ratio = new_w / frame_w, new_h / frame_h

    # imnormalize_ rejects uint8 input, so cast to float32 after resizing
    frames = [
        mmcv.imresize(img, (new_w, new_h)).astype(np.float32)
        for img in frames
    ]
    _ = [mmcv.imnormalize_(frame, **self.img_norm_cfg) for frame in frames]

    # THWC -> CTHW -> 1CTHW
    input_array = np.stack(frames).transpose((3, 0, 1, 2))[np.newaxis]
    input_tensor = torch.from_numpy(input_array).to(self.device)

    proposal = proposals[len(proposals) // 2]
    proposal = torch.from_numpy(proposal[:, :4]).to(self.device)
    if proposal.shape[0] == 0:
        return None
    proposal[:, 0:4:2] *= w_ratio
    proposal[:, 1:4:2] *= h_ratio

    with torch.no_grad():
        result = self.model(
            return_loss=False,
            img=[input_tensor],
            img_metas=[[dict(img_shape=(new_h, new_w))]],
            proposals=[[proposal]])
    return self.post_proce(result, proposal)
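The `**self.img_norm_cfg` expansion above assumes a dict whose keys line up with `imnormalize_`'s keyword arguments. A sketch of what such a config could look like (the values are illustrative, not taken from the source):

import numpy as np

img_norm_cfg = dict(
    mean=np.array([123.675, 116.28, 103.53], dtype=np.float32),  # assumed
    std=np.array([58.395, 57.12, 57.375], dtype=np.float32),     # assumed
    to_rgb=False)
# then each frame is normalized in place via:
# mmcv.imnormalize_(frame, **img_norm_cfg)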
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)
    val_pipeline = config.data.val.pipeline

    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Load label_map
    label_map = load_label_map(args.label_map)
    try:
        if config['data']['train']['custom_classes'] is not None:
            label_map = {
                id + 1: label_map[cls]
                for id, cls in enumerate(config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        config['model']['test_cfg']['rcnn']['action_thr'] = .0
    except KeyError:
        pass

    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    assert len(timestamps) == len(human_detections)
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp, proposal in zip(timestamps, human_detections):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)
        prog_bar.update()

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(int)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)
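The `THWC -> CTHW -> 1CTHW` comment in the demos above describes how a clip is packed for the detector: stack T frames of shape (H, W, C), move channels first, then add a batch axis. A toy shape check of that transpose, with made-up dimensions:

import numpy as np
import torch

# T=8 frames of H=16, W=24, C=3
imgs = [np.random.rand(16, 24, 3).astype(np.float32) for _ in range(8)]

# (T, H, W, C) -> (C, T, H, W) -> (1, C, T, H, W)
input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array)
print(input_tensor.shape)  # torch.Size([1, 3, 8, 16, 24])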
def read_fn(self):
    """Main function for read thread.

    Contains three steps:

    1) Read and preprocess (resize + norm) frames from source.
    2) Create task by frames from previous step and buffer.
    3) Put task into read queue.
    """
    was_read = True
    start_time = time.time()
    while was_read and not self.stopped:
        # init task
        task = TaskInfo()
        task.clip_vis_length = self.clip_vis_length
        task.frames_inds = self.frames_inds
        task.ratio = self.ratio

        # read buffer
        frames = []
        processed_frames = []
        if len(self.buffer) != 0:
            frames = self.buffer
        if len(self.processed_buffer) != 0:
            processed_frames = self.processed_buffer

        # read and preprocess frames from source and update task
        with self.read_lock:
            before_read = time.time()
            read_frame_cnt = self.window_size - len(frames)
            while was_read and len(frames) < self.window_size:
                was_read, frame = self.cap.read()
                if not self.webcam:
                    # Reading frames too fast may lead to unexpected
                    # performance degradation. If you have enough
                    # resource, this line could be commented.
                    time.sleep(1 / self.output_fps)
                if was_read:
                    frames.append(mmcv.imresize(frame, self.display_size))
                    processed_frame = mmcv.imresize(
                        frame, self.stdet_input_size).astype(np.float32)
                    _ = mmcv.imnormalize_(processed_frame,
                                          **self.img_norm_cfg)
                    processed_frames.append(processed_frame)
        task.add_frames(self.read_id + 1, frames, processed_frames)

        # update buffer
        if was_read:
            self.buffer = frames[-self.buffer_size:]
            self.processed_buffer = processed_frames[-self.buffer_size:]

        # update read state
        with self.read_id_lock:
            self.read_id += 1
            self.not_end = was_read

        self.read_queue.put((was_read, copy.deepcopy(task)))
        cur_time = time.time()
        logger.debug(
            f'Read thread: {1000*(cur_time - start_time):.0f} ms, '
            f'{read_frame_cnt / (cur_time - before_read):.0f} fps')
        start_time = cur_time
def main():
    args = parse_args()

    # Read frames from a folder of extracted images instead of a video
    # (replaces the original `frame_extraction(args.video)` call).
    video_path = args.video
    frame_paths = sorted(
        [osp.join(video_path, x) for x in os.listdir(video_path)])
    num_frame = len(frame_paths)
    frame = cv2.imread(frame_paths[0])
    h, w, _ = frame.shape

    # Load label_map
    label_map = load_label_map(args.label_map)

    # Resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.inf))
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    val_pipeline = config['val_pipeline']

    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    # Optionally shrink frame_interval when the folder has too few frames:
    # if num_frame < clip_len * frame_interval:
    #     frame_interval = max(int(num_frame / clip_len) - 1, 0)
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    for timestamp, proposal in tqdm(zip(timestamps, human_detections)):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        # Frames are loaded lazily from disk per clip instead of being
        # kept in memory (the original indexed a preloaded `frames` list).
        imgs = [
            mmcv.imresize(cv2.imread(frame_paths[ind]),
                          (new_w, new_h)).astype(np.float32)
            for ind in frame_inds
        ]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(int)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    # Save the visualized frames as individual images as well
    target_dir = osp.join('./tmp/test')
    os.makedirs(target_dir, exist_ok=True)
    frame_tmpl = osp.join(target_dir, 'img_%06d.jpg')
    vid.write_images_sequence(frame_tmpl, fps=args.output_fps)
def rgb_based_stdet(args, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):

    rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
    rgb_stdet_config.merge_from_dict(args.cfg_options)

    val_pipeline = rgb_stdet_config.data.val.pipeline
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'

    window_size = clip_len * frame_interval
    num_frame = len(frames)
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Get img_norm_cfg
    img_norm_cfg = rgb_stdet_config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        rgb_stdet_config['model']['test_cfg']['rcnn']['action_thr'] = .0
    except KeyError:
        pass

    rgb_stdet_config.model.backbone.pretrained = None
    rgb_stdet_model = build_detector(
        rgb_stdet_config.model, test_cfg=rgb_stdet_config.get('test_cfg'))
    load_checkpoint(
        rgb_stdet_model, args.rgb_stdet_checkpoint, map_location='cpu')
    rgb_stdet_model.to(args.device)
    rgb_stdet_model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmcv.ProgressBar(len(timestamps))

    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)

        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = rgb_stdet_model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):  # 80
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)
        prog_bar.update()

    return timestamps, predictions
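Several of the demos above rename the legacy `to_bgr` key to `to_rgb` so the config dict can be splatted straight into `mmcv.imnormalize_(img, **img_norm_cfg)`, whose keyword is `to_rgb` (the BGR/RGB channel flip is symmetric, so only the name changes). A sketch of that normalization step on a hypothetical raw config entry:

import numpy as np

# Hypothetical raw entry as it might appear in a config file
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

# Rename the legacy key and convert lists to arrays so the dict matches
# imnormalize_'s signature
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
    img_norm_cfg['to_rgb'] = img_norm_cfg.pop('to_bgr')  # same flip, new name
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'], dtype=np.float32)
img_norm_cfg['std'] = np.array(img_norm_cfg['std'], dtype=np.float32)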