def run_demo(cfg, frame_provider):
    """
    Run demo visualization.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        frame_provider (iterator): Python iterator that returns task objects
            filled with the necessary information, such as `frames`, `id` and
            `num_buffer_frames`, for the prediction and visualization pipeline.
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    assert cfg.NUM_GPUS <= 1, "Cannot run demo on multiple GPUs."

    video_vis = VideoVisualizer(
        cfg.MODEL.NUM_CLASSES,
        cfg.DEMO.LABEL_FILE_PATH,
        cfg.TENSORBOARD.MODEL_VIS.TOPK_PREDS,
        cfg.TENSORBOARD.MODEL_VIS.COLORMAP,
    )

    if cfg.DETECTION.ENABLE:
        object_detector = Detectron2Predictor(cfg)

    model = ActionPredictor(cfg)

    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    assert (
        cfg.DEMO.BUFFER_SIZE <= seq_len // 2
    ), "Buffer size cannot be greater than half of sequence length."
    init_task_info(
        frame_provider.display_height,
        frame_provider.display_width,
        cfg.DATA.TEST_CROP_SIZE,
        cfg.DEMO.CLIP_VIS_SIZE,
    )

    for able_to_read, task in frame_provider:
        if not able_to_read:
            break

        if cfg.DETECTION.ENABLE:
            task = object_detector(task)

        task = model(task)
        frames = draw_predictions(task, video_vis)
        # Hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

        yield frames
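# A minimal, hypothetical sketch (not part of the repo) of the frame-provider
# contract that `run_demo` assumes: an iterable with `display_height` and
# `display_width` attributes that yields `(able_to_read, task)` pairs, where
# each task carries `frames`, `id` and `num_buffer_frames`. The names
# `DummyTask` and `DummyFrameProvider` below are illustrative only.
class DummyTask:
    def __init__(self, frames, task_id, num_buffer_frames=0):
        self.frames = frames
        self.id = task_id
        self.num_buffer_frames = num_buffer_frames


class DummyFrameProvider:
    display_height = 480
    display_width = 640

    def __init__(self, clips):
        self.clips = clips

    def __iter__(self):
        for i, frames in enumerate(self.clips):
            yield True, DummyTask(frames, i)
        # Signal end-of-stream the way `run_demo` expects.
        yield False, None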
def run_visualization(vis_loader, model, cfg, writer=None):
    """
    Run model visualization (weights, activations and model inputs) and
    visualize them on Tensorboard.
    Args:
        vis_loader (loader): video visualization loader.
        model (model): the video model to visualize.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard log.
    """
    n_devices = cfg.NUM_GPUS * cfg.NUM_SHARDS
    prefix = "module/" if n_devices > 1 else ""
    # Get a list of selected layer names and indexing.
    layer_ls, indexing_dict = process_layer_index_data(
        cfg.TENSORBOARD.MODEL_VIS.LAYER_LIST, layer_name_prefix=prefix
    )
    logger.info("Start Model Visualization.")
    # Register hooks for activations.
    model_vis = GetWeightAndActivation(model, layer_ls)

    if writer is not None and cfg.TENSORBOARD.MODEL_VIS.MODEL_WEIGHTS:
        layer_weights = model_vis.get_weights()
        writer.plot_weights_and_activations(
            layer_weights, tag="Layer Weights/", heat_map=False
        )

    video_vis = VideoVisualizer(
        cfg.MODEL.NUM_CLASSES,
        cfg.TENSORBOARD.CLASS_NAMES_PATH,
        cfg.TENSORBOARD.MODEL_VIS.TOPK_PREDS,
        cfg.TENSORBOARD.MODEL_VIS.COLORMAP,
    )
    if n_devices > 1:
        grad_cam_layer_ls = [
            "module/" + layer
            for layer in cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST
        ]
    else:
        grad_cam_layer_ls = cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST

    if cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE:
        gradcam = GradCAM(
            model,
            target_layers=grad_cam_layer_ls,
            data_mean=cfg.DATA.MEAN,
            data_std=cfg.DATA.STD,
            colormap=cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.COLORMAP,
        )
    logger.info("Finish drawing weights.")
    global_idx = -1
    for inputs, labels, _, meta in tqdm.tqdm(vis_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        if cfg.DETECTION.ENABLE:
            activations, preds = model_vis.get_activations(
                inputs, meta["boxes"]
            )
        else:
            activations, preds = model_vis.get_activations(inputs)

        if cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE:
            if cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.USE_TRUE_LABEL:
                inputs, preds = gradcam(inputs, labels=labels)
            else:
                inputs, preds = gradcam(inputs)

        if cfg.NUM_GPUS:
            inputs = du.all_gather_unaligned(inputs)
            activations = du.all_gather_unaligned(activations)
            preds = du.all_gather_unaligned(preds)
            if isinstance(inputs[0], list):
                for i in range(len(inputs)):
                    for j in range(len(inputs[0])):
                        inputs[i][j] = inputs[i][j].cpu()
            else:
                inputs = [inp.cpu() for inp in inputs]
            preds = [pred.cpu() for pred in preds]
        else:
            inputs, activations, preds = [inputs], [activations], [preds]

        boxes = [None] * max(n_devices, 1)
        if cfg.DETECTION.ENABLE and cfg.NUM_GPUS:
            boxes = du.all_gather_unaligned(meta["boxes"])
            boxes = [box.cpu() for box in boxes]

        if writer is not None:
            total_vids = 0
            for i in range(max(n_devices, 1)):
                cur_input = inputs[i]
                cur_activations = activations[i]
                cur_batch_size = cur_input[0].shape[0]
                cur_preds = preds[i]
                cur_boxes = boxes[i]
                for cur_batch_idx in range(cur_batch_size):
                    global_idx += 1
                    total_vids += 1
                    if (
                        cfg.TENSORBOARD.MODEL_VIS.INPUT_VIDEO
                        or cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE
                    ):
                        for path_idx, input_pathway in enumerate(cur_input):
                            if cfg.TEST.DATASET == "ava" and cfg.AVA.BGR:
                                video = input_pathway[
                                    cur_batch_idx, [2, 1, 0], ...
                                ]
                            else:
                                video = input_pathway[cur_batch_idx]

                            if not cfg.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE:
                                # Permute to (T, H, W, C) from (C, T, H, W).
                                video = video.permute(1, 2, 3, 0)
                                video = data_utils.revert_tensor_normalize(
                                    video, cfg.DATA.MEAN, cfg.DATA.STD
                                )
                            else:
                                # Permute from (T, C, H, W) to (T, H, W, C).
                                video = video.permute(0, 2, 3, 1)
                            bboxes = (
                                None if cur_boxes is None else cur_boxes[:, 1:]
                            )
                            cur_prediction = (
                                cur_preds
                                if cfg.DETECTION.ENABLE
                                else cur_preds[cur_batch_idx]
                            )
                            video = video_vis.draw_clip(
                                video, cur_prediction, bboxes=bboxes
                            )
                            video = (
                                torch.from_numpy(np.array(video))
                                .permute(0, 3, 1, 2)
                                .unsqueeze(0)
                            )
                            writer.add_video(
                                video,
                                tag="Input {}/Pathway {}".format(
                                    global_idx, path_idx + 1
                                ),
                            )
                    if cfg.TENSORBOARD.MODEL_VIS.ACTIVATIONS:
                        writer.plot_weights_and_activations(
                            cur_activations,
                            tag="Input {}/Activations: ".format(global_idx),
                            batch_idx=cur_batch_idx,
                            indexing_dict=indexing_dict,
                        )
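# The un-normalization step above assumes the convention used by
# `data_utils.revert_tensor_normalize`: multiply by the per-channel std, then
# add the per-channel mean. A self-contained sketch of that transform (the
# function name here is ours, not the repo's):
import torch


def revert_tensor_normalize_sketch(tensor, mean, std):
    """Revert normalization for a (T, H, W, C) tensor with per-channel stats."""
    return tensor * torch.tensor(std) + torch.tensor(mean)


# Example: a normalized all-zeros RGB clip maps back to the per-channel mean.
frame = torch.zeros(1, 2, 2, 3)
restored = revert_tensor_normalize_sketch(frame, mean=[0.45] * 3, std=[0.225] * 3)
assert torch.allclose(restored, torch.full_like(frame, 0.45))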
def run_demo(cfg, frame_provider):
    """
    Run demo visualization.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        frame_provider (iterator): Python iterator that returns task objects
            filled with the necessary information, such as `frames`, `id` and
            `num_buffer_frames`, for the prediction and visualization pipeline.
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    common_classes = (
        cfg.DEMO.COMMON_CLASS_NAMES
        if len(cfg.DEMO.LABEL_FILE_PATH) != 0
        else None
    )

    # Visualizer that draws predicted boxes and labels on frames.
    video_vis = VideoVisualizer(
        num_classes=cfg.MODEL.NUM_CLASSES,
        class_names_path=cfg.DEMO.LABEL_FILE_PATH,
        top_k=cfg.TENSORBOARD.MODEL_VIS.TOPK_PREDS,
        thres=cfg.DEMO.COMMON_CLASS_THRES,
        lower_thres=cfg.DEMO.UNCOMMON_CLASS_THRES,
        common_class_names=common_classes,
        colormap=cfg.TENSORBOARD.MODEL_VIS.COLORMAP,
        mode=cfg.DEMO.VIS_MODE,
    )

    async_vis = AsyncVis(video_vis, n_workers=cfg.DEMO.NUM_VIS_INSTANCES)

    if cfg.NUM_GPUS <= 1:
        # Instantiate the action detection/recognition predictor.
        model = ActionPredictor(cfg=cfg, async_vis=async_vis)
    else:
        model = AsyncDemo(cfg=cfg, async_vis=async_vis)

    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    assert (
        cfg.DEMO.BUFFER_SIZE <= seq_len // 2
    ), "Buffer size cannot be greater than half of sequence length."

    num_task = 0
    # Start reading frames.
    frame_provider.start()
    for able_to_read, task in frame_provider:
        if not able_to_read:
            break
        if task is None:
            time.sleep(0.02)
            continue
        num_task += 1

        # Submit the detection and recognition task.
        model.put(task)
        try:
            task = model.get()
            num_task -= 1
            yield task
        except IndexError:
            continue

    while num_task != 0:
        try:
            task = model.get()
            num_task -= 1
            yield task
        except IndexError:
            continue
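# Why the `except IndexError: continue` pattern above works: `model.get()` is
# non-blocking and raises IndexError while no finished task is ready, so the
# loop keeps feeding new tasks instead of stalling, and the trailing `while`
# drains whatever is still in flight. A toy, self-contained mimic of that
# non-blocking queue behavior (names are illustrative, not from the repo):
from collections import deque


class ToyAsyncModel:
    def __init__(self):
        self._done = deque()

    def put(self, task):
        # A real implementation would process asynchronously; here we just
        # mark the task as finished immediately.
        self._done.append(task)

    def get(self):
        if not self._done:
            raise IndexError("no finished task available yet")
        return self._done.popleft()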
def draw_video(self):
    """
    Draw predicted and ground-truth (if provided) results on the video/folder
    of images. Write the visualized result to a video output file.
    """
    all_boxes = merge_pred_gt_boxes(self.pred_boxes, self.gt_boxes)
    common_classes = (
        self.cfg.DEMO.COMMON_CLASS_NAMES
        if len(self.cfg.DEMO.LABEL_FILE_PATH) != 0
        else None
    )
    video_vis = VideoVisualizer(
        num_classes=self.cfg.MODEL.NUM_CLASSES,
        class_names_path=self.cfg.DEMO.LABEL_FILE_PATH,
        top_k=self.cfg.TENSORBOARD.MODEL_VIS.TOPK_PREDS,
        thres=self.cfg.DEMO.COMMON_CLASS_THRES,
        lower_thres=self.cfg.DEMO.UNCOMMON_CLASS_THRES,
        common_class_names=common_classes,
        colormap=self.cfg.TENSORBOARD.MODEL_VIS.COLORMAP,
        mode=self.cfg.DEMO.VIS_MODE,
    )

    all_keys = sorted(all_boxes.keys())
    # Draw around the keyframe for 2/10 of the sequence length.
    # This is chosen heuristically.
    draw_range = [
        self.seq_length // 2 - self.seq_length // 10,
        self.seq_length // 2 + self.seq_length // 10,
    ]
    draw_range_repeat = [
        draw_range[0],
        (draw_range[1] - draw_range[0]) * self.no_frames_repeat
        + draw_range[0],
    ]
    prev_buffer = []
    prev_end_idx = 0

    logger.info("Start Visualization...")
    for keyframe_idx in tqdm.tqdm(all_keys):
        pred_gt_boxes = all_boxes[keyframe_idx]
        # Find the starting index of the clip. If the clip would extend past
        # the beginning of the video, start from index 0 instead.
        start_idx = max(0, keyframe_idx - self.seq_length // 2)
        # Number of frames between the end of the previous clip and the
        # start of the current clip.
        dist = start_idx - prev_end_idx
        # If there are unwritten frames in between clips.
        if dist >= 0:
            # Get the frames in between the previous and current clips.
            frames = self._get_frame_range(prev_end_idx, dist)
            # We keep a buffer of frames for overlapping visualization.
            # Write these to the output file.
            for frame in prev_buffer:
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                self.display(frame)
            # Write them to the output file without any visualization,
            # since they don't have any corresponding keyframes.
            for frame in frames:
                self.display(frame)
            prev_buffer = []
            num_new_frames = self.seq_length
        # If there are overlapping frames in between clips.
        elif dist < 0:
            # Flush all ready frames.
            for frame in prev_buffer[:dist]:
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                self.display(frame)
            prev_buffer = prev_buffer[dist:]
            num_new_frames = self.seq_length + dist
        # Obtain new frames for the current clip from the input video file.
        new_frames = self._get_frame_range(
            max(start_idx, prev_end_idx), num_new_frames
        )
        new_frames = [
            cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in new_frames
        ]
        clip = prev_buffer + new_frames
        # Calculate the end of this clip. This will be `prev_end_idx`
        # for the next iteration.
        prev_end_idx = max(start_idx, prev_end_idx) + len(new_frames)
        # For each set of precomputed or ground-truth boxes.
        for i, boxes in enumerate(pred_gt_boxes):
            if i == 0:
                repeat = self.no_frames_repeat
                current_draw_range = draw_range
            else:
                repeat = 1
                current_draw_range = draw_range_repeat
            # Make sure the draw range does not run past the end of the clip.
            current_draw_range[1] = min(
                current_draw_range[1], len(clip) - 1
            )
            ground_truth = boxes[0]
            bboxes = boxes[1]
            label = boxes[2]
            # Draw predictions.
            clip = video_vis.draw_clip_range(
                clip,
                label,
                bboxes=torch.Tensor(bboxes),
                ground_truth=ground_truth,
                draw_range=current_draw_range,
                repeat_frame=repeat,
            )
        # Store the current clip as buffer.
        prev_buffer = clip

    # Write the remaining buffer to the output file.
    for frame in prev_buffer:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        self.display(frame)
    # If there are still frames left in the input file,
    # write those to the output file as well.
    if prev_end_idx < self.total_frames:
        dist = self.total_frames - prev_end_idx
        remaining_clip = self._get_frame_range(prev_end_idx, dist)
        for frame in remaining_clip:
            self.display(frame)
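# A quick, self-contained check of the draw-range arithmetic above: for a
# sequence length of 64, predictions are drawn on the 2/10 of frames centered
# on the keyframe, i.e. indices 26 through 38 of the clip.
seq_length = 64
draw_range = [
    seq_length // 2 - seq_length // 10,  # 32 - 6 = 26
    seq_length // 2 + seq_length // 10,  # 32 + 6 = 38
]
assert draw_range == [26, 38]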
class WrongPredictionVis:
    """
    WrongPredictionVis class for visualizing video inputs to Tensorboard
    for instances that the model makes wrong predictions.
    """

    def __init__(self, cfg):
        """
        Args:
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
        """
        self.cfg = cfg
        self.class_names, _, self.subset = get_class_names(
            cfg.TENSORBOARD.CLASS_NAMES_PATH,
            subset_path=cfg.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH,
        )
        if self.subset is not None:
            self.subset = set(self.subset)
        self.num_class = cfg.MODEL.NUM_CLASSES
        self.video_vis = VideoVisualizer(
            cfg.MODEL.NUM_CLASSES,
            cfg.TENSORBOARD.CLASS_NAMES_PATH,
            1,
            cfg.TENSORBOARD.MODEL_VIS.COLORMAP,
        )
        self.tag = cfg.TENSORBOARD.WRONG_PRED_VIS.TAG
        self.writer = tb.TensorboardWriter(cfg)
        self.model_incorrect_classes = set()

    def _pick_wrong_preds(self, labels, preds):
        """
        Returns a boolean mask over instances that have wrong predictions,
        where the true labels are in the specified subset.
        Args:
            labels (tensor): tensor of shape (n_instances,) containing
                class ids.
            preds (tensor): class scores from model, shape
                (n_instances, n_classes).
        Returns:
            mask (tensor): boolean tensor. `mask[i]` is True if `model` makes
                a wrong prediction.
        """
        subset_mask = torch.ones(size=(len(labels),), dtype=torch.bool)
        if self.subset is not None:
            for i, label in enumerate(labels):
                if label not in self.subset:
                    subset_mask[i] = False

        preds_ids = torch.argmax(preds, dim=-1)
        mask = preds_ids != labels
        mask &= subset_mask
        for i, wrong_pred in enumerate(mask):
            if wrong_pred:
                self.model_incorrect_classes.add(labels[i])

        return mask

    def visualize_vid(self, video_input, labels, preds, batch_idx):
        """
        Draw predicted labels on video inputs and visualize all incorrectly
        classified videos in the current batch.
        Args:
            video_input (list of list of tensor(s)): list of videos for all
                pathways.
            labels (array-like): shape (n_instances,) of true labels for
                each instance.
            preds (tensor): shape (n_instances, n_classes). The predicted
                scores for all instances.
            batch_idx (int): batch index of the current videos.
        """

        def add_video(vid, preds, tag, true_class_name):
            """
            Draw predicted label on video and add it to Tensorboard.
            Args:
                vid (array-like): shape (C, T, H, W). Each image in `vid` is
                    an RGB image.
                preds (tensor): shape (n_classes,) or (1, n_classes). The
                    predicted scores for the current `vid`.
                tag (str): tag for `vid` in Tensorboard.
                true_class_name (str): the ground-truth class name of the
                    current `vid` instance.
            """
            # Permute to (T, H, W, C).
            vid = vid.permute(1, 2, 3, 0)
            vid = data_utils.revert_tensor_normalize(
                vid.cpu(), self.cfg.DATA.MEAN, self.cfg.DATA.STD
            )
            vid = self.video_vis.draw_clip(vid, preds)
            vid = torch.from_numpy(np.array(vid)).permute(0, 3, 1, 2)
            vid = torch.unsqueeze(vid, dim=0)
            self.writer.add_video(
                vid, tag="{}: {}".format(tag, true_class_name)
            )

        mask = self._pick_wrong_preds(labels, preds)
        video_indices = torch.squeeze(mask.nonzero(), dim=-1)
        # Visualize each wrongly classified video.
        for vid_idx in video_indices:
            cur_vid_idx = batch_idx * len(video_input[0]) + vid_idx
            for pathway in range(len(video_input)):
                add_video(
                    video_input[pathway][vid_idx],
                    preds=preds[vid_idx],
                    tag=self.tag
                    + "/Video {}, Pathway {}".format(cur_vid_idx, pathway),
                    true_class_name=self.class_names[labels[vid_idx]],
                )

    @property
    def wrong_class_prediction(self):
        """
        Return class names that the model predicted incorrectly.
        """
        incorrect_class_names = [
            self.class_names[i] for i in self.model_incorrect_classes
        ]
        return list(set(incorrect_class_names))

    def clean(self):
        """
        Close Tensorboard writer.
        """
        self.writer.close()
def run_visualization(vis_loader, model, cfg, writer=None):
    """
    Run model visualization (weights, activations and model inputs) and
    visualize them on Tensorboard.
    Args:
        vis_loader (loader): video visualization loader.
        model (model): the video model to visualize.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard log.
    """
    n_devices = cfg.NUM_GPUS * cfg.NUM_SHARDS
    prefix = "module/" if n_devices > 1 else ""
    # Get a list of selected layer names and indexing.
    layer_ls, indexing_dict = process_layer_index_data(
        cfg.TENSORBOARD.MODEL_VIS.LAYER_LIST, layer_name_prefix=prefix
    )
    logger.info("Start Model Visualization.")
    # Register hooks for activations.
    model_vis = GetWeightAndActivation(model, layer_ls)

    if writer is not None and cfg.TENSORBOARD.MODEL_VIS.MODEL_WEIGHTS:
        layer_weights = model_vis.get_weights()
        writer.plot_weights_and_activations(
            layer_weights, tag="Layer Weights/", heat_map=False
        )

    video_vis = VideoVisualizer(
        cfg.MODEL.NUM_CLASSES,
        cfg.TENSORBOARD.CLASS_NAMES_PATH,
        cfg.TENSORBOARD.MODEL_VIS.TOPK_PREDS,
        cfg.TENSORBOARD.MODEL_VIS.COLORMAP,
    )
    logger.info("Finish drawing weights.")
    global_idx = -1
    for inputs, _, _, meta in vis_loader:
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.DETECTION.ENABLE:
            activations, preds = model_vis.get_activations(
                inputs, meta["boxes"]
            )
        else:
            activations, preds = model_vis.get_activations(inputs)

        inputs = du.all_gather_unaligned(inputs)
        activations = du.all_gather_unaligned(activations)
        preds = du.all_gather_unaligned(preds)
        boxes = [None] * n_devices
        if cfg.DETECTION.ENABLE:
            boxes = du.all_gather_unaligned(meta["boxes"])

        if writer is not None:
            total_vids = 0
            for i in range(n_devices):
                cur_input = inputs[i]
                cur_activations = activations[i]
                cur_batch_size = cur_input[0].shape[0]
                cur_preds = preds[i].cpu()
                cur_boxes = boxes[i]
                for cur_batch_idx in range(cur_batch_size):
                    global_idx += 1
                    total_vids += 1
                    if cfg.TENSORBOARD.MODEL_VIS.INPUT_VIDEO:
                        for path_idx, input_pathway in enumerate(cur_input):
                            if (
                                cfg.TEST.DATASET == "ava"
                                or cfg.TEST.DATASET == "custom"
                            ) and cfg.AVA.BGR:
                                video = input_pathway[
                                    cur_batch_idx, [2, 1, 0], ...
                                ]
                            else:
                                video = input_pathway[cur_batch_idx]
                            # Permute to (T, H, W, C) from (C, T, H, W).
                            video = video.permute(1, 2, 3, 0)
                            video = data_utils.revert_tensor_normalize(
                                video.cpu(), cfg.DATA.MEAN, cfg.DATA.STD
                            )
                            bboxes = (
                                None
                                if cur_boxes is None
                                else cur_boxes[:, 1:].cpu()
                            )
                            video = video_vis.draw_clip(
                                video, cur_preds, bboxes=bboxes
                            )
                            video = (
                                torch.Tensor(video)
                                .permute(0, 3, 1, 2)
                                .unsqueeze(0)
                            )
                            writer.add_video(
                                video,
                                tag="Input {}/Input from pathway {}".format(
                                    global_idx, path_idx + 1
                                ),
                            )
                    if cfg.TENSORBOARD.MODEL_VIS.ACTIVATIONS:
                        writer.plot_weights_and_activations(
                            cur_activations,
                            tag="Input {}/Activations: ".format(global_idx),
                            batch_idx=cur_batch_idx,
                            indexing_dict=indexing_dict,
                        )
            logger.info("Visualized {} videos...".format(total_vids))
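# The `[2, 1, 0]` fancy index above swaps the channel order of a
# (C, T, H, W) clip between BGR and RGB. A self-contained demonstration:
import torch

clip = torch.arange(3 * 2 * 2 * 2).reshape(3, 2, 2, 2)  # (C, T, H, W)
flipped = clip[[2, 1, 0], ...]
assert torch.equal(flipped[0], clip[2]) and torch.equal(flipped[2], clip[0])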