def __getitem__(self, index):
    """
    Build the model input for the sample at ``index``.

    The clip returned by ``self.sample_frames`` is color-normalized with the
    configured mean/std, reordered from T H W C to C T H W, and packed into
    pathway inputs by ``utils.pack_pathway_output``.

    Args:
        index: sample index; passed unchanged to ``self.sample_frames``.

    Returns:
        tuple: ``(frames, index)`` where ``frames`` is whatever
        ``utils.pack_pathway_output`` produces for this config.
    """
    clip = self.sample_frames(index)
    data_cfg = self.cfg.DATA
    normalized = utils.tensor_normalize(clip, data_cfg.MEAN, data_cfg.STD)
    # T H W C -> C T H W.
    channels_first = normalized.permute(3, 0, 1, 2)
    return utils.pack_pathway_output(self.cfg, channels_first), index
def process_cv2_inputs(frames, cfg):
    """
    Turn one clip of raw frames into a list of batched pathway tensors.

    Args:
        frames (list of array): list of input images (one clip) with values
            in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py

    Returns:
        list: one tensor per pathway, each with a leading batch axis of
        size 1 added.
    """
    # Scale to [0, 1] before applying the configured mean/std.
    clip = torch.from_numpy(np.array(frames)).float() / 255
    clip = tensor_normalize(clip, cfg.DATA.MEAN, cfg.DATA.STD)
    # T H W C -> C T H W.
    clip = clip.permute(3, 0, 1, 2)

    # Keep cfg.DATA.NUM_FRAMES frames, evenly spaced along the time axis.
    last_frame = clip.shape[1] - 1
    keep = torch.linspace(0, last_frame, cfg.DATA.NUM_FRAMES).long()
    clip = torch.index_select(clip, 1, keep)

    pathways = pack_pathway_output(cfg, clip)
    return [pathway.unsqueeze(0) for pathway in pathways]
def slowfast_predict(
    self,
    frames,
    labels,
):
    """
    Run the action model on one clip and hand the per-box labels to
    ``self.fra``.

    The clip is scaled to [0, 1], normalized with the configured mean/std,
    split into slow and fast pathways, and moved to the GPU. Boxes are taken
    from ``self.queue_demo``; when there are none the model is skipped.

    Args:
        frames (list of array): frames of one clip; assumed T x H x W x C
            with values in [0, 255] -- TODO confirm against the caller.
        labels (sequence): label names indexed by class id.

    Returns:
        None. The nested list of label names (one list per box) is passed
        to ``self.fra``.
    """
    start_time = time.time()
    inputs = torch.from_numpy(np.array(frames)).float() / 255.0
    print("frame change time is :", time.time() - start_time)
    inputs = tensor_normalize(inputs, self.cfg.DATA.MEAN, self.cfg.DATA.STD)
    # T H W C -> C T H W, then add the batch axis: 1 C T H W.
    inputs = inputs.permute(3, 0, 1, 2).unsqueeze(0)

    # Sample frames for the fast pathway.
    index = torch.linspace(0, inputs.shape[2] - 1,
                           self.cfg.DATA.NUM_FRAMES).long()
    fast_pathway = torch.index_select(inputs, 2, index)
    # Sample frames for the slow pathway.
    index = torch.linspace(
        0,
        fast_pathway.shape[2] - 1,
        fast_pathway.shape[2] // self.cfg.SLOWFAST.ALPHA,
    ).long()
    slow_pathway = torch.index_select(fast_pathway, 2, index)

    # Transfer the data to the current GPU device. The inputs are always a
    # two-element list here, so no non-list fallback is needed.
    inputs = [pathway.cuda() for pathway in (slow_pathway, fast_pathway)]

    boxes = self.queue_demo.get()
    if not len(boxes):
        # Nothing detected: an empty tensor makes the steps below no-ops.
        preds = torch.tensor([])
    else:
        preds = self.model(inputs, boxes)
    if self.cfg.NUM_GPUS > 1:
        preds = du.all_gather(preds)[0]

    preds = preds.cpu().detach().numpy()
    # Multi-label decision: keep every class whose score exceeds 0.1.
    pred_masks = preds > .1
    label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
    pred_labels = [[labels[label_id] for label_id in perbox_label_ids]
                   for perbox_label_ids in label_ids]

    detection_time = time.time()
    print(f'slowfast cost time is :{(detection_time-start_time)}')
    self.fra(pred_labels)
def demo(cfg):
    """
    Run inference on an input video or stream from webcam.

    Frames are buffered until ``NUM_FRAMES * SAMPLING_RATE`` are available,
    then one prediction is made and the buffer is emptied. Every frame is
    annotated with the most recent prediction and displayed. The loop ends
    when a frame cannot be read or Esc is pressed.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py

    Raises:
        NotImplementedError: if no checkpoint can be located, or if
            ``cfg.MODEL.ARCH`` is neither a single- nor multi-pathway arch.
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Setup logging format.
    logging.setup_logging()
    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = build.build_model(cfg)
    model.eval()
    misc.log_model_info(model, cfg)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2.
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
        dtron2_cfg.MODEL.WEIGHTS = (
            cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS)
        logger.info("Initialize detectron2 model.")
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of AVA dataset. splitlines() keeps the last label
        # even when the file lacks a trailing newline, and strips "\r" from
        # CRLF files.
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().splitlines()
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
        logger.info("Finish loading detectron2")
    else:
        # Load the labels of Kinetics-400 dataset.
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df["name"].values

    frame_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.0
    for able_to_read, frame in frame_provider:
        if not able_to_read:
            # The end of the stream was reached: drop the buffer and stop.
            frames = []
            break

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            # Remember the (unscaled, BGR) frame near the middle of the clip;
            # the object detector runs on it.
            if cfg.DETECTION.ENABLE and len(frames) == seq_len // 2 - 1:
                mid_frame = frame

        if len(frames) == seq_len:
            start = time()
            if cfg.DETECTION.ENABLE:
                outputs = object_predictor(mid_frame)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes.
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                boxes = cv2_transform.scale_boxes(
                    cfg.DATA.TEST_CROP_SIZE,
                    pred_boxes,
                    frame_provider.display_height,
                    frame_provider.display_width,
                )
                # Prepend a zero column to every box.
                boxes = torch.cat(
                    [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                    axis=1,
                )

            inputs = tensor_normalize(torch.as_tensor(frames), cfg.DATA.MEAN,
                                      cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)
            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH:
                # Sample frames for the fast pathway.
                index = torch.linspace(0, inputs.shape[2] - 1,
                                       cfg.DATA.NUM_FRAMES).long()
                inputs = [torch.index_select(inputs, 2, index)]
            elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH:
                # Sample frames for the fast pathway.
                index = torch.linspace(0, inputs.shape[2] - 1,
                                       cfg.DATA.NUM_FRAMES).long()
                fast_pathway = torch.index_select(inputs, 2, index)
                # Sample frames for the slow pathway.
                index = torch.linspace(
                    0,
                    fast_pathway.shape[2] - 1,
                    fast_pathway.shape[2] // cfg.SLOWFAST.ALPHA,
                ).long()
                slow_pathway = torch.index_select(fast_pathway, 2, index)
                inputs = [slow_pathway, fast_pathway]
            else:
                raise NotImplementedError("Model arch {} is not in {}".format(
                    cfg.MODEL.ARCH,
                    cfg.MODEL.SINGLE_PATHWAY_ARCH +
                    cfg.MODEL.MULTI_PATHWAY_ARCH,
                ))

            # Transfer the data to the current GPU device. Both branches
            # above produce a list, so only the list case needs handling.
            inputs = [pathway.cuda(non_blocking=True) for pathway in inputs]

            # Perform the forward pass.
            if cfg.DETECTION.ENABLE:
                # When there is nothing in the scene,
                # use a dummy variable to disable all computations below.
                if not len(boxes):
                    preds = torch.tensor([])
                else:
                    preds = model(inputs, boxes)
            else:
                preds = model(inputs)

            # Gather all the predictions across all the devices to perform
            # ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            if cfg.DETECTION.ENABLE:
                # Post-processing is deliberately done on the CPU to keep GPU
                # memory usage low; move it to CUDA if memory allows.
                preds = preds.cpu().detach().numpy()
                pred_masks = preds > 0.1
                label_ids = [
                    np.nonzero(pred_mask)[0] for pred_mask in pred_masks
                ]
                pred_labels = [[
                    labels[label_id] for label_id in perbox_label_ids
                ] for perbox_label_ids in label_ids]
                # Rescale the boxes that were fed to the model back towards
                # the display size instead of relying on detectron2's own
                # rescaling.
                boxes = boxes.cpu().detach().numpy()
                ratio = (np.min([
                    frame_provider.display_height,
                    frame_provider.display_width,
                ]) / cfg.DATA.TEST_CROP_SIZE)
                # Drop the prepended zero column before scaling.
                boxes = boxes[:, 1:] * ratio
            else:
                # Multi-label inference: keep every class whose score is
                # above the 0.1 threshold.
                label_ids = (torch.nonzero(
                    preds.squeeze() > 0.1).reshape(-1).cpu().detach().numpy())
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ["Unknown"]

            # Empty the buffer so the next clip starts from scratch.
            frames = []
            s = time() - start

        if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
            for box, box_labels in zip(boxes.astype(int), pred_labels):
                cv2.rectangle(
                    frame,
                    tuple(box[:2]),
                    tuple(box[2:]),
                    (0, 255, 0),
                    thickness=2,
                )
                # Labels are stacked upwards from the box's first corner.
                label_origin = box[:2]
                for label in box_labels:
                    label_origin[-1] -= 5
                    (label_width, label_height), _ = cv2.getTextSize(
                        label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                    cv2.rectangle(
                        frame,
                        (label_origin[0], label_origin[1] + 5),
                        (
                            label_origin[0] + label_width,
                            label_origin[1] - label_height - 5,
                        ),
                        palette[labels.index(label)],
                        -1,
                    )
                    cv2.putText(
                        frame,
                        label,
                        tuple(label_origin),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        (255, 255, 255),
                        1,
                    )
                    label_origin[-1] -= label_height + 5

        if not cfg.DETECTION.ENABLE:
            # Display predicted labels to frame.
            y_offset = 50
            cv2.putText(
                frame,
                "Action:",
                (10, y_offset),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.65,
                color=(0, 235, 0),
                thickness=2,
            )
            for pred_label in pred_labels:
                y_offset += 30
                cv2.putText(
                    frame,
                    "{}".format(pred_label),
                    (20, y_offset),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.65,
                    color=(0, 235, 0),
                    thickness=2,
                )

        # Display prediction speed.
        cv2.putText(
            frame,
            "Speed: {:.2f}s".format(s),
            (10, 25),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=0.65,
            color=(0, 235, 0),
            thickness=2,
        )
        frame_provider.display(frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    frame_provider.clean()
def __getitem__(self, index):
    """
    Given the video index, decode one clip and return it with its metadata.
    If the video cannot be opened or decoded, a neighbouring video is tried
    instead, up to ``self._num_retries`` attempts in total.

    Args:
        index (int or tuple): the video index provided by the pytorch
            sampler. When short cycle is used the index is a tuple whose
            first element is the video index; the second element is not
            used here.

    Returns:
        tuple: ``(frames, label, index, {}, unique_video_idx,
        temporal_sample_index, spatial_sample_index, path)`` where
            frames (tensor): normalized clip, permuted with (3, 0, 1, 2) and
                cropped by ``transform.crop_EAC_image``.
            label: ``self._labels[index]`` of the video actually decoded.
            index (int): index of the video actually decoded, which differs
                from the requested one if a replacement was used.
            unique_video_idx: ``self._unique_video_idx[index]``.
            temporal_sample_index, spatial_sample_index (int): derived from
                the requested index, not from a replacement.
            path: ``self._path_to_videos[index]``.

    Raises:
        RuntimeError: if no video could be decoded within the retry budget.
    """
    # When short cycle is used, input index is a tuple.
    if isinstance(index, tuple):
        index, _ = index

    temporal_sample_index = (self._spatial_temporal_idx[index] //
                             self._number_of_spatial_crops)
    spatial_sample_index = (self._spatial_temporal_idx[index] %
                            self._number_of_spatial_crops)

    # Fixed decoding parameters for this dataset.
    sampling_rate = 2
    NUM_ENSEMBLE_VIEWS = 10
    NUM_FRAMES = 64
    # Normalization statistics used by CLIP.
    datamean = [0.48145466, 0.4578275, 0.40821073]
    datastd = [0.26862954, 0.26130258, 0.27577711]

    def neighbouring_index(current):
        # Step to the next video, or back by one when already at the end.
        if current + 1 < len(self._path_to_videos):
            return current + 1
        return current - 1

    # Try to decode and sample a clip from a video. If the video can not be
    # decoded, repeatedly pick a replacement video.
    for _ in range(self._num_retries):
        video_container = None
        try:
            video_container = container.get_video_container(
                self._path_to_videos[index],
                False,
                "pyav",
            )
        except Exception:
            # Best effort: jump to a random video; the neighbour step below
            # is still applied on top of it.
            index = random.randint(0, len(self._path_to_videos) - 1)
        if video_container is None:
            index = neighbouring_index(index)
            continue

        # Decode video. Meta info is used to perform selective decoding.
        frames = decoder.decode(
            container=video_container,
            sampling_rate=sampling_rate,
            num_frames=NUM_FRAMES,
            clip_idx=temporal_sample_index,
            num_clips=NUM_ENSEMBLE_VIEWS,
            video_meta=None,
            target_fps=30,
            backend="pyav",
            max_spatial_scale=256,
        )

        # If decoding failed (wrong format, video is too short, and etc),
        # select another video.
        if frames is None:
            print('try a new one')
            index = neighbouring_index(index)
            continue

        # Perform color normalization.
        frames = utils.tensor_normalize(frames, datamean, datastd)
        # T H W C -> C T H W (assumes decoder output is T H W C -- TODO
        # confirm).
        frames = frames.permute(3, 0, 1, 2)
        frames = transform.crop_EAC_image(frames, spatial_sample_index)
        label = self._labels[index]
        return (
            frames,
            label,
            index,
            {},
            self._unique_video_idx[index],
            temporal_sample_index,
            spatial_sample_index,
            self._path_to_videos[index],
        )

    raise RuntimeError(
        "Failed to fetch video after {} retries.".format(
            self._num_retries))
def __call__(self, task):
    """
    Returns the prediction results for the current task.

    Args:
        task (TaskInfo object): task object that contain
            the necessary information for action prediction. (e.g. frames,
            boxes)

    Returns:
        task (TaskInfo object): the same task info object but filled with
            prediction values (a tensor) and the corresponding boxes for
            action detection task.
    """
    # Function-scope import, as in the original implementation.
    from slowfast.datasets.utils import tensor_normalize

    def to_pathway_tensor(pathway_frames):
        # Scale each frame, stack to a float tensor in [0, 1], normalize,
        # and reorder T H W C -> 1 C T H W.
        scaled = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in pathway_frames
        ]
        tensor = torch.from_numpy(np.array(scaled)).float() / 255
        tensor = tensor_normalize(tensor, self.cfg.DATA.MEAN,
                                  self.cfg.DATA.STD)
        return tensor.permute(3, 0, 1, 2).unsqueeze(0)

    # ------ 1. First stage: detection ------
    if self.cfg.DETECTION.ENABLE:
        task = self.object_detector(task)

    # ------ 2. Second stage: recognition ------
    frames, bboxes = task.frames, task.bboxes

    if self.cfg.DEMO.INPUT_FORMAT == "BGR":
        frames = [
            cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
        ]

    # Pick the frames of each pathway. NOTE(review): presumably lineSpace
    # fills the output list with evenly spaced entries (32 of frames 0..63,
    # then 8 of those 32) -- confirm against cv2_transform.lineSpace. The
    # counts are hard-coded rather than read from cfg.
    inputs1 = []
    inputs0 = []
    cv2_transform.lineSpace(0, 63, 32, frames, inputs1)
    cv2_transform.lineSpace(0, 31, 8, inputs1, inputs0)

    inputs = [to_pathway_tensor(inputs0), to_pathway_tensor(inputs1)]

    if bboxes is not None:
        bboxes = cv2_transform.scale_boxes(
            self.cfg.DATA.TEST_CROP_SIZE,
            bboxes,
            task.img_height,
            task.img_width,
        )
        index_pad = torch.full(
            size=(bboxes.shape[0], 1),
            fill_value=float(0),
            device=bboxes.device,
        )
        # Pad frame index for each box.
        bboxes = torch.cat([index_pad, bboxes], axis=1)

    if self.cfg.NUM_GPUS > 0:
        # Transfer the data to the current GPU device.
        device = torch.device(self.gpu_id)
        inputs = [
            pathway.cuda(device=device, non_blocking=True)
            for pathway in inputs
        ]

    if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
        # No boxes detected: skip the model entirely.
        preds = torch.tensor([])
    else:
        # The model used here takes each pathway without the batch axis and
        # with time first, e.g. [1, 3, 8, 224, 224] -> [8, 3, 224, 224].
        inputs = [
            pathway.squeeze(0).permute(1, 0, 2, 3) for pathway in inputs
        ]
        # ... and the boxes with two leading singleton axes,
        # e.g. [3, 5] -> [1, 1, 3, 5].
        if bboxes is not None:
            bboxes = bboxes.unsqueeze(0).unsqueeze(0)

        preds = self.model(inputs, bboxes)

        # Restore the box shape for the caller: [1, 1, 3, 5] -> [3, 5].
        if bboxes is not None:
            bboxes = bboxes.squeeze(0).squeeze(0)

    if self.cfg.NUM_GPUS:
        preds = preds.cpu()
        if bboxes is not None:
            bboxes = bboxes.detach().cpu()

    preds = preds.detach()
    task.add_action_preds(preds)
    if bboxes is not None:
        # Drop the padded frame-index column again.
        task.add_bboxes(bboxes[:, 1:])

    return task