def demo(cfg, backbone):
    """
    Run person detection and action recognition on a video file and write the
    annotated frames to './result/testset_fighting_05.avi'.

    Args:
        cfg (CfgNode): configs. The video is opened from cfg.DEMO.DATA_SOURCE
            and the action labels are read from cfg.DEMO.LABEL_FILE_PATH.
        backbone (str): 'yolo' selects the YOLO person detector; any other
            value selects the detectron2 `DefaultPredictor`.

    Raises:
        NotImplementedError: if no checkpoint path can be resolved.
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    # Build the person detector.
    if backbone == 'yolo':
        darknetlib_path = '/home/ubuntu/hanhbd/SlowFast/detector/libdarknet.so'
        config_path = '/home/ubuntu/hanhbd/SlowFast/detector/yolov4.cfg'
        meta_path = '/home/ubuntu/hanhbd/SlowFast/detector/coco.data'
        classes_path = '/home/ubuntu/hanhbd/SlowFast/detector/coco.names'
        weight_path = '/home/ubuntu/hanhbd/SlowFast/detector/yolov4.weights'
        object_predictor = YOLO.get_instance(
            darknetlib_path, config_path, meta_path, classes_path, weight_path)
    else:
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)

    # One label per line; the trailing empty entry after the last newline is dropped.
    with open(cfg.DEMO.LABEL_FILE_PATH) as f:
        labels = f.read().split('\n')[:-1]
    # One random colour per label for the label background boxes.
    palette = np.random.randint(64, 128, (len(labels), 3)).tolist()

    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []        # preprocessed (RGB, rescaled) frames fed to the model
    org_frames = []    # matching original BGR frames used for drawing
    mid_frame = None
    pred_labels = []
    draw_imgs = []     # annotated frames waiting to be written out

    cap = cv2.VideoCapture(cfg.DEMO.DATA_SOURCE)
    videowriter = None
    try:
        # NOTE(review): this first frame is only used to prime the loop
        # condition; the loop reads again before using `frame`.
        was_read, frame = cap.read()
        display_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        display_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        fourcc = cv2.VideoWriter_fourcc(*'DIVX')
        videowriter = cv2.VideoWriter(
            './result/testset_fighting_05.avi', fourcc, fps,
            (display_width, display_height))

        while was_read:
            was_read, frame = cap.read()
            if not was_read:
                break

            if len(frames) != seq_len:
                # Still filling the clip buffer.
                frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
                frames.append(frame_processed)
                org_frames.append(frame)
                continue

            # The buffer is full: run detection + recognition on this clip.
            # NOTE(review): the frame read in this iteration is not buffered.
            start = time()
            mid_frame = org_frames[seq_len // 2 - 2]

            # On the very first clip, also draw person boxes on the frames
            # that precede the mid frame so the output video starts with them.
            if cfg.DETECTION.ENABLE and len(draw_imgs) == 0:
                for idx in range(seq_len // 2 - 1):
                    image = org_frames[idx]
                    boxes = detector(object_predictor, image, backbone, cfg,
                                     display_height, display_width)
                    boxes = torch.cat(
                        [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                        axis=1)
                    boxes = boxes.cpu().detach().numpy()
                    if backbone == 'yolo':
                        boxes = boxes[:, 1:]
                    else:
                        ratio = np.min(
                            [display_height, display_width]
                        ) / cfg.DATA.TEST_CROP_SIZE
                        boxes = boxes[:, 1:] * ratio
                    # cv2 drawing functions need integer pixel coordinates;
                    # the boxes are floats at this point.
                    for xmin, ymin, xmax, ymax in boxes.astype(int).tolist():
                        cv2.rectangle(image, (xmin, ymin), (xmax, ymax),
                                      (0, 255, 0), thickness=2)
                    draw_imgs.append(image)

            # Detect person boxes in the mid frame.
            # NOTE(review): `boxes` is only assigned when cfg.DETECTION.ENABLE
            # is set, yet it is used unconditionally below.
            if cfg.DETECTION.ENABLE:
                boxes = detector(object_predictor, mid_frame, backbone, cfg,
                                 display_height, display_width)
                # Prepend a column of zeros to every box row.
                boxes = torch.cat(
                    [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                    axis=1)

            inputs = torch.from_numpy(np.array(frames)).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)
            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            # Sample frames for the fast pathway.
            index = torch.linspace(
                0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            # Sample frames for the slow pathway.
            index = torch.linspace(
                0, fast_pathway.shape[2] - 1,
                fast_pathway.shape[2] // cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)

            # Transfer the data to the current GPU device.
            inputs = [
                pathway.cuda(non_blocking=True)
                for pathway in (slow_pathway, fast_pathway)
            ]

            # With no boxes, use an empty tensor so the post-processing below
            # yields no labels.
            if not len(boxes):
                preds = torch.tensor([])
            else:
                preds = model(inputs, boxes)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            # Post-processing: keep every label whose score exceeds 0.1.
            preds = preds.cpu().detach().numpy()
            pred_masks = preds > .1
            label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
            pred_labels = [
                [labels[label_id] for label_id in perbox_label_ids]
                for perbox_label_ids in label_ids
            ]
            print(pred_labels)

            boxes = boxes.cpu().detach().numpy()
            if backbone == 'yolo':
                boxes = boxes[:, 1:]
            else:
                ratio = np.min(
                    [display_height, display_width]
                ) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio

            # Draw the result on the mid frame.
            if pred_labels and boxes.any():
                for box, box_labels in zip(boxes.astype(int), pred_labels):
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(mid_frame, (xmin, ymin), (xmax, ymax),
                                  (0, 255, 0), thickness=2)
                    # Labels are stacked upwards from the box's top-left corner.
                    label_origin = box[:2]
                    for label in box_labels:
                        label_origin[-1] -= 5
                        (label_width, label_height), _ = cv2.getTextSize(
                            label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                        cv2.rectangle(
                            mid_frame,
                            (label_origin[0], label_origin[1] + 5),
                            (label_origin[0] + label_width,
                             label_origin[1] - label_height - 5),
                            palette[labels.index(label)], -1
                        )
                        cv2.putText(
                            mid_frame, label, tuple(label_origin),
                            cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
                        )
                        label_origin[-1] -= label_height + 5

            # Append the mid frame to the draw array and write everything out.
            draw_imgs.append(mid_frame)
            for img_ in draw_imgs:
                videowriter.write(img_)
            print("time process", (time() - start) / 64)

            # Slide the clip window by one frame and keep only the latest
            # drawn image.
            frames = frames[1:]
            org_frames = org_frames[1:]
            draw_imgs = draw_imgs[-1:]
    finally:
        # Always finalise the output file and free the capture device, even
        # when inference raises part-way through the video.
        if videowriter is not None:
            videowriter.release()
        cap.release()
def demo(cfg):
    """
    Run inference on an input video or stream from webcam.

    With cfg.DETECTION.ENABLE set, persons are detected with detectron2 on a
    frame near the middle of each clip and per-box labels are drawn;
    otherwise clip-level labels are drawn as text on the frame.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py

    Raises:
        NotImplementedError: if no checkpoint path can be resolved, or if
            cfg.MODEL.ARCH is in neither the single- nor multi-pathway list.
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Setup logging format.
    logging.setup_logging()
    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = build.build_model(cfg)
    model.eval()
    misc.log_model_info(model, cfg)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2.
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
        dtron2_cfg.MODEL.WEIGHTS = (
            cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS)
        logger.info("Initialize detectron2 model.")
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of AVA dataset
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().split("\n")[:-1]
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
        logger.info("Finish loading detectron2")
    else:
        # Load the labels of Kinectics-400 dataset.
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df["name"].values

    frame_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.0

    try:
        for able_to_read, frame in frame_provider:
            if not able_to_read:
                # End of the stream: drop the partial clip and stop.
                frames = []
                break

            if len(frames) != seq_len:
                frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
                frames.append(frame_processed)
                # Remember the original frame near the middle of the clip for
                # person detection.
                if cfg.DETECTION.ENABLE and len(frames) == seq_len // 2 - 1:
                    mid_frame = frame

            if len(frames) == seq_len:
                start = time()
                if cfg.DETECTION.ENABLE:
                    outputs = object_predictor(mid_frame)
                    fields = outputs["instances"]._fields
                    pred_classes = fields["pred_classes"]
                    selection_mask = pred_classes == 0
                    # acquire person boxes.
                    pred_classes = pred_classes[selection_mask]
                    pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                    boxes = cv2_transform.scale_boxes(
                        cfg.DATA.TEST_CROP_SIZE,
                        pred_boxes,
                        frame_provider.display_height,
                        frame_provider.display_width,
                    )
                    # Prepend a column of zeros to every box row.
                    boxes = torch.cat(
                        [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                        axis=1,
                    )

                # Stack into one ndarray first: building a tensor directly
                # from a list of ndarrays is the slow path in PyTorch.
                inputs = tensor_normalize(
                    torch.as_tensor(np.array(frames)),
                    cfg.DATA.MEAN, cfg.DATA.STD)
                # T H W C -> C T H W.
                inputs = inputs.permute(3, 0, 1, 2)
                # 1 C T H W.
                inputs = inputs.unsqueeze(0)

                if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH:
                    # Sample frames for the fast pathway.
                    index = torch.linspace(
                        0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
                    inputs = [torch.index_select(inputs, 2, index)]
                elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH:
                    # Sample frames for the fast pathway.
                    index = torch.linspace(
                        0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
                    fast_pathway = torch.index_select(inputs, 2, index)
                    # Sample frames for the slow pathway.
                    index = torch.linspace(
                        0,
                        fast_pathway.shape[2] - 1,
                        fast_pathway.shape[2] // cfg.SLOWFAST.ALPHA,
                    ).long()
                    slow_pathway = torch.index_select(fast_pathway, 2, index)
                    inputs = [slow_pathway, fast_pathway]
                else:
                    raise NotImplementedError("Model arch {} is not in {}".format(
                        cfg.MODEL.ARCH,
                        cfg.MODEL.SINGLE_PATHWAY_ARCH
                        + cfg.MODEL.MULTI_PATHWAY_ARCH,
                    ))

                # Transfer the data to the current GPU device.
                if isinstance(inputs, (list, )):
                    for i in range(len(inputs)):
                        inputs[i] = inputs[i].cuda(non_blocking=True)
                else:
                    inputs = inputs.cuda(non_blocking=True)

                # Perform the forward pass.
                if cfg.DETECTION.ENABLE:
                    # When there is nothing in the scene,
                    # use a dummy variable to disable all computations below.
                    if not len(boxes):
                        preds = torch.tensor([])
                    else:
                        preds = model(inputs, boxes)
                else:
                    preds = model(inputs)

                # Gather all the predictions across all the devices to perform ensemble.
                if cfg.NUM_GPUS > 1:
                    preds = du.all_gather(preds)[0]

                if cfg.DETECTION.ENABLE:
                    # Post-processing is done on the CPU to keep GPU memory
                    # usage down.
                    preds = preds.cpu().detach().numpy()
                    pred_masks = preds > 0.1
                    label_ids = [
                        np.nonzero(pred_mask)[0] for pred_mask in pred_masks
                    ]
                    pred_labels = [[
                        labels[label_id] for label_id in perbox_label_ids
                    ] for perbox_label_ids in label_ids]
                    # Rescale the boxes that were fed to the model back to
                    # the display size instead of relying on the detector's
                    # own rescaling.
                    boxes = boxes.cpu().detach().numpy()
                    ratio = (np.min([
                        frame_provider.display_height,
                        frame_provider.display_width,
                    ]) / cfg.DATA.TEST_CROP_SIZE)
                    boxes = boxes[:, 1:] * ratio
                else:
                    # Multi-label inference: keep every class whose score
                    # exceeds the 0.1 threshold.
                    label_ids = (torch.nonzero(
                        preds.squeeze() > 0.1).reshape(-1).cpu().detach().numpy())
                    pred_labels = labels[label_ids]
                    logger.info(pred_labels)
                    if not list(pred_labels):
                        pred_labels = ["Unknown"]

                # Empty the buffer so the next clip starts from scratch.
                frames = []
                s = time() - start

            if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
                for box, box_labels in zip(boxes.astype(int), pred_labels):
                    cv2.rectangle(
                        frame,
                        tuple(box[:2]),
                        tuple(box[2:]),
                        (0, 255, 0),
                        thickness=2,
                    )
                    # Labels are stacked upwards from the box's top-left corner.
                    label_origin = box[:2]
                    for label in box_labels:
                        label_origin[-1] -= 5
                        (label_width, label_height), _ = cv2.getTextSize(
                            label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                        cv2.rectangle(
                            frame,
                            (label_origin[0], label_origin[1] + 5),
                            (
                                label_origin[0] + label_width,
                                label_origin[1] - label_height - 5,
                            ),
                            palette[labels.index(label)],
                            -1,
                        )
                        cv2.putText(
                            frame,
                            label,
                            tuple(label_origin),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5,
                            (255, 255, 255),
                            1,
                        )
                        label_origin[-1] -= label_height + 5

            if not cfg.DETECTION.ENABLE:
                # Display predicted labels to frame.
                y_offset = 50
                cv2.putText(
                    frame,
                    "Action:",
                    (10, y_offset),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.65,
                    color=(0, 235, 0),
                    thickness=2,
                )
                for pred_label in pred_labels:
                    y_offset += 30
                    cv2.putText(
                        frame,
                        "{}".format(pred_label),
                        (20, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.65,
                        color=(0, 235, 0),
                        thickness=2,
                    )

            # Display prediction speed.
            cv2.putText(
                frame,
                "Speed: {:.2f}s".format(s),
                (10, 25),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.65,
                color=(0, 235, 0),
                thickness=2,
            )
            frame_provider.display(frame)
            # hit Esc to quit the demo.
            key = cv2.waitKey(1)
            if key == 27:
                break
    finally:
        # Clean up the frame provider even if inference or drawing raised.
        frame_provider.clean()
def perform_test(test_loader, model, test_meter, cfg):
    """
    For classification:
    Perform multi-view testing that uniformly samples N clips from a video
    along its temporal axis. For each clip, it takes 3 crops to cover the
    spatial dimension, followed by averaging the softmax scores across all
    Nx3 views to form a video-level prediction. All video predictions are
    compared to ground-truth labels and the final testing performance is
    logged.
    For detection:
    Perform fully-convolutional testing on the full frames without crop.

    Args:
        test_loader (loader): video testing loader.
        model (model): the pretrained video model to test.
        test_meter (TestMeter): testing meters to log and ensemble the
            testing results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()

    # Nothing here back-propagates, so disable autograd for the whole loop
    # to avoid building a graph on every forward pass.
    with torch.no_grad():
        for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader):
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
                    print(inputs[i].size())
            else:
                inputs = inputs.cuda(non_blocking=True)
                print(inputs.size())

            # Transfer the labels, indices and metadata to the GPU as well.
            labels = labels.cuda()
            video_idx = video_idx.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(
                        du.all_gather_unaligned(ori_boxes), dim=0)
                    metadata = torch.cat(
                        du.all_gather_unaligned(metadata), dim=0)

                test_meter.iter_toc()
                # Update and log stats.
                test_meter.update_stats(
                    preds.detach().cpu(),
                    ori_boxes.detach().cpu(),
                    metadata.detach().cpu(),
                )
                test_meter.log_iter_stats(None, cur_iter)
            else:
                # Perform the forward pass.
                preds = model(inputs)

                # Gather all the predictions across all the devices to
                # perform ensemble.
                if cfg.NUM_GPUS > 1:
                    preds, labels, video_idx = du.all_gather(
                        [preds, labels, video_idx])

                test_meter.iter_toc()
                # Update and log stats.
                test_meter.update_stats(
                    preds.detach().cpu(),
                    labels.detach().cpu(),
                    video_idx.detach().cpu(),
                )
                test_meter.log_iter_stats(cur_iter)

            test_meter.iter_tic()

    # Log epoch stats and print the final testing results.
    test_meter.finalize_metrics()
    test_meter.reset()
def perform_test(test_loader, model, test_meter, cfg):
    """
    For classification:
    Perform multi-view testing that uniformly samples N clips from a video
    along its temporal axis. For each clip, it takes 3 crops to cover the
    spatial dimension, followed by averaging the softmax scores across all
    Nx3 views to form a video-level prediction. All video predictions are
    compared to ground-truth labels and the final testing performance is
    logged.
    For detection:
    Perform fully-convolutional testing on the full frames without crop.

    Args:
        test_loader (loader): video testing loader. Each batch is a dict with
            the keys 'inputs', 'label', 'index' and 'metadata' (plus 'bboxs'
            and 'masks' when cfg.EPICKITCHENS.USE_BBOX is set).
        model (model): the pretrained video model to test.
        test_meter (TestMeter): testing meters to log and ensemble the
            testing results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py

    Returns:
        (preds, labels, metadata) as returned by
        `test_meter.finalize_metrics()` when cfg.TEST.DATASET is
        'epickitchens', otherwise (None, None, None). When
        cfg.TEST.EXTRACT_FEATURES is set, a fourth element holds the
        collected features moved to the CPU: a flat list when
        cfg.TEST.EXTRACT_MSTCN_FEATURES is set, else a pair of lists
        (x_feat[0] entries, x_feat[1] entries).
    """
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()

    extract_feats = cfg.TEST.EXTRACT_FEATURES
    mstcn_feats = cfg.TEST.EXTRACT_MSTCN_FEATURES
    x_feat_list = None
    if extract_feats:
        # MSTCN features are one tensor per batch; otherwise the model
        # returns a (slow, fast) pair that is collected in two lists.
        x_feat_list = [] if mstcn_feats else [[], []]

    # Disable autograd: nothing is back-propagated here, and the features
    # collected in x_feat_list would otherwise keep every batch's graph alive.
    with torch.no_grad():
        for cur_iter, output_dict in enumerate(test_loader):
            if cur_iter % 100 == 0:
                logger.info("Testing iter={}".format(cur_iter))

            inputs = output_dict['inputs']
            labels = output_dict['label']
            video_idx = output_dict['index']
            meta = output_dict['metadata']
            if cfg.EPICKITCHENS.USE_BBOX:
                bboxs = output_dict['bboxs']
                masks = output_dict['masks']
            else:
                bboxs = None
                masks = None

            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            # Transfer the labels and indices to the current GPU device.
            if isinstance(labels, (dict,)):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            video_idx = video_idx.cuda()

            if cfg.DETECTION.ENABLE:
                for key, val in meta.items():
                    if isinstance(val, (list,)):
                        for i in range(len(val)):
                            val[i] = val[i].cuda(non_blocking=True)
                    else:
                        meta[key] = val.cuda(non_blocking=True)

                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(
                        du.all_gather_unaligned(ori_boxes), dim=0)
                    metadata = torch.cat(
                        du.all_gather_unaligned(metadata), dim=0)

                test_meter.iter_toc()
                # Update and log stats.
                test_meter.update_stats(
                    preds.detach().cpu(),
                    ori_boxes.detach().cpu(),
                    metadata.detach().cpu(),
                )
                test_meter.log_iter_stats(None, cur_iter)
            else:
                # Perform the forward pass.
                if cfg.EPICKITCHENS.USE_BBOX:
                    bboxs = to_cuda(bboxs)
                    masks = to_cuda(masks)
                    preds_pair = model(inputs, bboxes=bboxs, masks=masks)
                else:
                    preds_pair = model(inputs)

                # With feature extraction the model returns (preds, features).
                if extract_feats:
                    preds, x_feat = preds_pair
                else:
                    preds = preds_pair

                if isinstance(labels, (dict,)):
                    # Verb/noun labels: preds is indexed as (verb, noun).
                    if cfg.NUM_GPUS > 1:
                        # Gather everything in ONE call so that video_idx is
                        # gathered exactly once. Gathering it alongside the
                        # verb tensors and again alongside the noun tensors
                        # would re-gather the already-gathered indices.
                        (
                            verb_preds,
                            noun_preds,
                            verb_labels,
                            noun_labels,
                            video_idx,
                        ) = du.all_gather([
                            preds[0],
                            preds[1],
                            labels['verb'],
                            labels['noun'],
                            video_idx,
                        ])
                        meta = du.all_gather_unaligned(meta)
                        metadata = {'narration_id': []}
                        for shard_meta in meta:
                            metadata['narration_id'].extend(
                                shard_meta['narration_id'])

                        if extract_feats:
                            if mstcn_feats:
                                x_feat_list.append(du.all_gather([x_feat])[0])
                            else:
                                x_feat_slow, x_feat_fast = du.all_gather(
                                    [x_feat[0], x_feat[1]])
                                x_feat_list[0].append(x_feat_slow)
                                x_feat_list[1].append(x_feat_fast)
                    else:
                        metadata = meta
                        verb_preds, verb_labels = preds[0], labels['verb']
                        noun_preds, noun_labels = preds[1], labels['noun']

                        if extract_feats:
                            if mstcn_feats:
                                x_feat_list.append(x_feat)
                            else:
                                x_feat_list[0].append(x_feat[0])
                                x_feat_list[1].append(x_feat[1])

                    test_meter.iter_toc()
                    # Update and log stats.
                    test_meter.update_stats(
                        (verb_preds.detach().cpu(), noun_preds.detach().cpu()),
                        (verb_labels.detach().cpu(), noun_labels.detach().cpu()),
                        metadata,
                        video_idx.detach().cpu(),
                    )
                else:
                    # Gather all the predictions across all the devices to
                    # perform ensemble. The gathered indices must replace
                    # video_idx so that all three tensors stay aligned.
                    if cfg.NUM_GPUS > 1:
                        preds, labels, video_idx = du.all_gather(
                            [preds, labels, video_idx]
                        )

                    test_meter.iter_toc()
                    # Update and log stats.
                    test_meter.update_stats(
                        preds.detach().cpu(),
                        labels.detach().cpu(),
                        video_idx.detach().cpu(),
                    )

            test_meter.iter_tic()

    # Log epoch stats and print the final testing results.
    if cfg.TEST.DATASET == 'epickitchens':
        preds, labels, metadata = test_meter.finalize_metrics()
    else:
        test_meter.finalize_metrics()
        preds, labels, metadata = None, None, None
    test_meter.reset()

    if extract_feats:
        if mstcn_feats:
            final_feat_list = [t.cpu() for t in x_feat_list]
        else:
            final_feat_list = [
                [t.cpu() for t in x_feat_list[0]],
                [t.cpu() for t in x_feat_list[1]],
            ]
        return preds, labels, metadata, final_feat_list
    return preds, labels, metadata
def process_frames_batch(cfg, frames, mid_frame, frame_provider,
                         object_predictor, model, labels):
    """
    Run the action model on one buffered clip and turn its scores into labels.

    Args:
        cfg (CfgNode): configs.
        frames: buffered clip frames, passed to
            `extract_slow_fast_path_from_frames`.
        mid_frame: frame passed to `get_person_boxes` for person detection
            (only used when cfg.DETECTION.ENABLE is set).
        frame_provider: object exposing `display_height` and `display_width`.
        object_predictor: detector passed to `get_person_boxes`.
        model: the video model; called as `model(inputs, boxes)` in detection
            mode and `model(inputs)` otherwise.
        labels: label names indexed by class id.

    Returns:
        tuple: (boxes, pred_labels). In detection mode `boxes` is a numpy
        array rescaled to the display size and `pred_labels` is a list of
        label lists, one per box. Otherwise `boxes` is None and `pred_labels`
        holds the clip-level labels (['Unknown'] when none pass the
        threshold).
    """
    # Bind `boxes` up front so the final `return` is valid when detection is
    # disabled; it is only assigned a real value in detection mode.
    boxes = None
    if cfg.DETECTION.ENABLE:
        boxes, _ = get_person_boxes(cfg, object_predictor, mid_frame,
                                    frame_provider)

    slow_pathway, fast_pathway = extract_slow_fast_path_from_frames(
        cfg, frames)

    # Transfer the data to the current GPU device.
    inputs = [
        pathway.cuda(non_blocking=True)
        for pathway in (slow_pathway, fast_pathway)
    ]

    # Perform the forward pass.
    if cfg.DETECTION.ENABLE:
        # When there is nothing in the scene,
        # use a dummy variable to disable all computations below.
        if not len(boxes):
            preds = torch.tensor([])
        else:
            preds = model(inputs, boxes)
    else:
        preds = model(inputs)

    # Gather all the predictions across all the devices to perform ensemble.
    if cfg.NUM_GPUS > 1:
        preds = du.all_gather(preds)[0]

    if cfg.DETECTION.ENABLE:
        # Post-processing is done on the CPU to keep GPU memory usage down.
        preds = preds.cpu().detach().numpy()
        print(preds)
        pred_masks = preds > cfg.DEMO.PREDS_THRESHHOLD
        label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
        pred_labels = [[labels[label_id] for label_id in perbox_label_ids]
                       for perbox_label_ids in label_ids]
        # Rescale the boxes that were fed to the model back to the display
        # size; the first column is dropped.
        boxes = boxes.cpu().detach().numpy()
        ratio = np.min([
            frame_provider.display_height, frame_provider.display_width
        ]) / cfg.DATA.TEST_CROP_SIZE
        boxes = boxes[:, 1:] * ratio
    else:
        # Multi-label inference: keep every class whose score exceeds 0.1.
        label_ids = torch.nonzero(
            preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
        pred_labels = labels[label_ids]
        logger.info(pred_labels)
        if not list(pred_labels):
            pred_labels = ['Unknown']

    return boxes, pred_labels
def train_epoch(self, train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer=None):
    """
    Perform the video training for one epoch.

    Besides the usual forward/backward/step loop this method logs per-rank
    throughput information and, for two-class models, collects the
    last-column predictions and the labels of every batch so they can be
    passed to the meter at iteration and epoch level.

    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the
            model's parameters.
        train_meter (TrainMeter): training meters to log the training
            performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.

    Returns:
        jobRate: total sample count summed over all workers divided by the
            maximum per-worker elapsed time (samples per second for the job).
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    start = time.time()
    # NOTE(review): mixes the `cfg` argument with `self.cfg` — confirm both
    # refer to the same config.
    btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
    rankE = os.environ.get("RANK", None)
    worldE = os.environ.get("WORLD_SIZE", None)
    # Number of batches times the batch size computed above.
    dSize = data_size * btch
    self.logger.info(
        "Train Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
        .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                du.get_rank(), rankE, du.get_world_size(), worldE))
    tot = 0          # running count of labels seen by this worker
    first = True     # log tensor shapes only for the first batch
    predsAll = []    # last-column predictions (two-class models only)
    labelsAll = []   # matching labels (two-class models only)
    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        tot += len(labels)
        if isinstance(inputs, (list, )):
            if first:
                self.logger.info(
                    "rank {} LEN {} {} shape Slow {} Fast {} {} tot {}".
                    format(du.get_rank(), len(labels), len(inputs),
                           inputs[0].shape, inputs[1].shape, labels[0].shape,
                           tot))
                first = False
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            if first:
                self.logger.info(
                    "rank {} LEN {} shape {} {} tot {}".format(
                        du.get_rank(), len(labels), inputs.shape,
                        labels[0].shape, tot))
                first = False
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate. The epoch passed to the schedule is
        # fractional: the current epoch plus the progress through it.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(
            cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )
            ite = data_size * cur_epoch + cur_iter
            if du.is_master_proc():
                self.logger.log_row(name='TrainLoss', iter=ite, loss=loss, description="train loss")
                self.logger.log_row(name='TrainLr', iter=ite, lr=lr, description="train learn rate")
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Binary classifier - save preds / labels for metrics
                if cfg.MODEL.NUM_CLASSES == 2:
                    predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                    labelsAll.extend(labels.detach().cpu().numpy())
                # Compute the errors. The second k is capped at the number
                # of classes.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            # self.logger.info("UPDATING stat {} {} {}".format(inputs[0].size(0), cfg.NUM_GPUS, inputs[0].size(0) * cfg.NUM_GPUS))
            train_meter.update_stats(top1_err, top5_err, loss, lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        stats = train_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                           labelsAll)
        # x position of the plot: dSize per finished epoch plus btch per
        # finished iteration of this epoch.
        ite = dSize * cur_epoch + btch * (cur_iter + 1)
        self.plotStats(stats, ite, 'TrainIter')
        train_meter.iter_tic()

    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, use_train_input=True)

    # Log epoch stats. The collected predictions and labels are moved to the
    # GPU for du.all_gather and brought back to numpy for the meter.
    gathered = du.all_gather([
        torch.tensor(predsAll).to(torch.device("cuda")),
        torch.tensor(labelsAll).to(torch.device("cuda"))
    ])
    stats = train_meter.log_epoch_stats(cur_epoch,
                                        gathered[0].detach().cpu().numpy(),
                                        gathered[1].detach().cpu().numpy())
    ite = (cur_epoch + 1) * dSize
    self.plotStats(stats, ite, 'TrainEpoch')
    train_meter.reset()
    end = time.time()
    el = end - start
    # Sum the per-worker sample counts and take the slowest worker's elapsed
    # time to compute the job-wide throughput.
    totAll = du.all_reduce([torch.tensor(tot).cuda()], average=False)
    tSum = totAll[0].item()
    elT = torch.tensor(el).cuda()
    elMax = du.all_reduce([elT], op=dist.ReduceOp.MAX,
                          average=False)[0].item()
    jobRate = tSum / elMax
    self.logger.info(
        "totSampCnt {} workerSampCnt {} eTimeMax {} eTimeWorker {}  SampPerSecJob {:.1f} SampPerSecWorker {:.1f}"
        .format(tSum, tot, elMax, el, jobRate, tot / el))
    return jobRate
def demo(cfg):
    """
    Run clip-level action recognition on frames from `VideoReader(cfg)` and
    display the predicted labels and the inference time in a window.

    Args:
        cfg (CfgNode): configs. The label CSV is read from
            cfg.DEMO.LABEL_FILE_PATH (column 'name').

    Raises:
        NotImplementedError: if no checkpoint path can be resolved.
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Setup logging format.
    logging.setup_logging()
    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    # Load the labels of Kinectics-400 dataset
    labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
    labels = labels_df['name'].values

    img_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.

    try:
        for able_to_read, frame in img_provider:
            if not able_to_read:
                # when reaches the end frame, clear the buffer and continue
                # to the next one.
                frames = []
                continue

            if len(frames) != seq_len:
                frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_processed = scale(256, frame_processed)
                frames.append(frame_processed)

            if len(frames) == seq_len:
                start = time()
                # Stack into one ndarray first: building a tensor directly
                # from a list of ndarrays is the slow path in PyTorch.
                inputs = torch.tensor(np.array(frames)).float()
                # Perform color normalization.
                inputs = inputs / 255.0
                inputs = inputs - torch.tensor(cfg.DATA.MEAN)
                inputs = inputs / torch.tensor(cfg.DATA.STD)
                # T H W C -> C T H W.
                inputs = inputs.permute(3, 0, 1, 2)
                # 1 C T H W.
                inputs = inputs[None, :, :, :, :]

                # Sample frames for the fast pathway.
                index = torch.linspace(
                    0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
                fast_pathway = torch.index_select(inputs, 2, index)
                logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))

                # Sample frames for the slow pathway.
                index = torch.linspace(
                    0, fast_pathway.shape[2] - 1,
                    fast_pathway.shape[2] // cfg.SLOWFAST.ALPHA).long()
                slow_pathway = torch.index_select(fast_pathway, 2, index)
                logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))

                # Transfer the data to the current GPU device.
                inputs = [
                    pathway.cuda(non_blocking=True)
                    for pathway in (slow_pathway, fast_pathway)
                ]

                # Perform the forward pass.
                preds = model(inputs)

                # Gather all the predictions across all the devices to
                # perform ensemble.
                if cfg.NUM_GPUS > 1:
                    preds = du.all_gather(preds)[0]

                # Multi-label inference: keep every class whose score
                # exceeds the 0.1 threshold.
                label_ids = torch.nonzero(
                    preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ['Unknown']

                # Empty the buffer so the next clip starts from scratch.
                frames = []
                s = time() - start

            # Display prediction speed to frame
            cv2.putText(frame, 'Speed: {:.2f}s'.format(s), (20, 30),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=1, color=(0, 235, 0), thickness=3)
            # Display predicted labels to frame.
            y_offset = 60
            cv2.putText(frame, 'Action:', (20, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=1, color=(0, 235, 0), thickness=3)
            for pred_label in pred_labels:
                y_offset += 30
                cv2.putText(frame, '{}'.format(pred_label), (20, y_offset),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=1, color=(0, 235, 0), thickness=3)

            # Display the frame
            cv2.imshow('SlowFast', frame)
            # hit Esc to quit the demo.
            key = cv2.waitKey(1)
            if key == 27:
                break
    finally:
        # Clean up the frame provider even if inference or drawing raised.
        img_provider.clean()
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer=None):
    """
    Evaluate the model on the val set.

    The forward pass runs under ``torch.no_grad()``, so the predictions that
    are handed to ``val_meter`` never carry an autograd graph.

    Args:
        val_loader (loader): data loader to provide validation data. Each item
            is unpacked as ``(inputs, labels, _, meta)``.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the
            metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device. Lists are updated
            # in place so the objects yielded by the loader stay the same.
            if isinstance(inputs, (list,)):
                for i, clip in enumerate(inputs):
                    inputs[i] = clip.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i, item in enumerate(val):
                        val[i] = item.cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        val_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            # Compute the predictions; no gradients are needed for evaluation.
            with torch.no_grad():
                preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"]
            metadata = meta["metadata"]

            if cfg.NUM_GPUS:
                preds = preds.cpu()
                ori_boxes = ori_boxes.cpu()
                metadata = metadata.cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds, ori_boxes, metadata)
        else:
            with torch.no_grad():
                preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))

                # Convert the correct counts into error percentages.
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]
                # Combine the errors across the GPUs.
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats. When running on CPU
                # (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    inputs[0].size(0) * max(cfg.NUM_GPUS, 1),
                )
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {"Val/Top1_err": top1_err, "Val/Top5_err": top5_err},
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars(
                {"Val/mAP": val_meter.full_map}, global_step=cur_epoch
            )
        else:
            all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
            all_labels = [
                label.clone().detach() for label in val_meter.all_labels
            ]
            if cfg.NUM_GPUS:
                all_preds = [pred.cpu() for pred in all_preds]
                all_labels = [label.cpu() for label in all_labels]
            writer.plot_eval(
                preds=all_preds, labels=all_labels, global_step=cur_epoch
            )

    val_meter.reset()
def perform_test(test_loader, model, test_meter, cfg, writer=None):
    """
    For classification:
    Perform multi-view testing that uniformly samples N clips from an audio
    along its temporal axis. Softmax scores are averaged across all N views to
    form an audio-level prediction. All audio predictions are compared to
    ground-truth labels and the final testing performance is logged.

    Labels may be a single tensor or a dict with ``'verb'`` and ``'noun'``
    entries; in the latter case ``model`` is expected to return a
    ``(verb_preds, noun_preds)`` pair.

    Args:
        test_loader (loader): audio testing loader. Each item is unpacked as
            ``(inputs, labels, audio_idx, meta)``.
        model (model): the pretrained audio model to test.
        test_meter (TestMeter): testing meters to log and ensemble the testing
            results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter object, optional): TensorboardWriter object
            to writer Tensorboard log.

    Returns:
        tuple: ``(test_meter, preds, preds_clips, labels, metadata)`` where the
            last four items are the values returned by
            ``test_meter.finalize_metrics()``.
    """
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()

    for cur_iter, (inputs, labels, audio_idx, meta) in enumerate(test_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i, clip in enumerate(inputs):
                    inputs[i] = clip.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            if isinstance(labels, (dict,)):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            audio_idx = audio_idx.cuda()
        test_meter.data_toc()

        # Perform the forward pass; no gradients are needed for testing.
        with torch.no_grad():
            preds = model(inputs)

        if isinstance(labels, (dict,)):
            # Gather all the predictions across all the devices to perform
            # ensemble.
            if cfg.NUM_GPUS > 1:
                # One collective call for everything: gathering audio_idx a
                # second time would gather the already-gathered tensor and
                # leave it longer than the predictions it indexes.
                (
                    verb_preds,
                    noun_preds,
                    verb_labels,
                    noun_labels,
                    audio_idx,
                ) = du.all_gather(
                    [
                        preds[0],
                        preds[1],
                        labels['verb'],
                        labels['noun'],
                        audio_idx,
                    ]
                )
                meta = du.all_gather_unaligned(meta)
                metadata = {'narration_id': []}
                for shard in meta:
                    metadata['narration_id'].extend(shard['narration_id'])
            else:
                metadata = meta
                verb_preds, noun_preds = preds[0], preds[1]
                verb_labels, noun_labels = labels['verb'], labels['noun']

            if cfg.NUM_GPUS:
                verb_preds = verb_preds.cpu()
                verb_labels = verb_labels.cpu()
                noun_preds = noun_preds.cpu()
                noun_labels = noun_labels.cpu()
                audio_idx = audio_idx.cpu()

            test_meter.iter_toc()
            # Update and log stats.
            test_meter.update_stats(
                (verb_preds.detach(), noun_preds.detach()),
                (verb_labels.detach(), noun_labels.detach()),
                metadata,
                audio_idx.detach(),
            )
            test_meter.log_iter_stats(cur_iter)
        else:
            # Gather all the predictions across all the devices to perform
            # ensemble.
            if cfg.NUM_GPUS > 1:
                preds, labels, audio_idx = du.all_gather(
                    [preds, labels, audio_idx]
                )
            if cfg.NUM_GPUS:
                preds = preds.cpu()
                labels = labels.cpu()
                audio_idx = audio_idx.cpu()

            test_meter.iter_toc()
            # Update and log stats.
            test_meter.update_stats(
                preds.detach(), labels.detach(), audio_idx.detach()
            )
            test_meter.log_iter_stats(cur_iter)

        test_meter.iter_tic()

    # Log epoch stats and print the final testing results.
    if cfg.TEST.DATASET != 'epickitchens':
        all_preds = test_meter.audio_preds.clone().detach()
        all_labels = test_meter.audio_labels
        if cfg.NUM_GPUS:
            all_preds = all_preds.cpu()
            all_labels = all_labels.cpu()
        if writer is not None:
            writer.plot_eval(preds=all_preds, labels=all_labels)

        if cfg.TEST.SAVE_RESULTS_PATH != "":
            save_path = os.path.join(cfg.OUTPUT_DIR, cfg.TEST.SAVE_RESULTS_PATH)

            # Only the root process writes the results file.
            if du.is_root_proc():
                with PathManager.open(save_path, "wb") as f:
                    pickle.dump([all_preds, all_labels], f)

            logger.info(
                "Successfully saved prediction results to {}".format(save_path)
            )

    preds, preds_clips, labels, metadata = test_meter.finalize_metrics()
    return test_meter, preds, preds_clips, labels, metadata
def perform_test(test_loader, model, test_meter, cfg, writer=None):
    """
    For classification:
    Perform multi-view testing that uniformly samples N clips from a video
    along its temporal axis. For each clip, it takes 3 crops to cover the
    spatial dimension, followed by averaging the softmax scores across all
    Nx3 views to form a video-level prediction. All video predictions are
    compared to ground-truth labels and the final testing performance is
    logged.
    For detection:
    Perform fully-convolutional testing on the full frames without crop.
    For the contrastive SSL model (``cfg.TASK == "ssl"`` and
    ``cfg.MODEL.MODEL_NAME == "ContrastiveModel"``):
    Class scores are built by k-NN voting over the model's ``train_labels``;
    if ``cfg.CONTRASTIVE.KNN_ON`` is false the function finalizes the meter
    and returns at the first batch.

    Args:
        test_loader (loader): video testing loader. Each item is unpacked as
            ``(inputs, labels, video_idx, time, meta)``.
        model (model): the pretrained video model to test.
        test_meter (TestMeter): testing meters to log and ensemble the testing
            results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter object, optional): TensorboardWriter object
            to writer Tensorboard log.

    Returns:
        TestMeter: the ``test_meter`` that was passed in.
    """
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()

    for cur_iter, (inputs, labels, video_idx, time, meta) in enumerate(
        test_loader
    ):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i, clip in enumerate(inputs):
                    inputs[i] = clip.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            labels = labels.cuda()
            video_idx = video_idx.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i, item in enumerate(val):
                        val[i] = item.cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        test_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            # Compute the predictions; no gradients are needed for testing.
            with torch.no_grad():
                preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"]
            metadata = meta["metadata"]

            preds = preds.detach().cpu() if cfg.NUM_GPUS else preds.detach()
            ori_boxes = (
                ori_boxes.detach().cpu() if cfg.NUM_GPUS else ori_boxes.detach()
            )
            metadata = (
                metadata.detach().cpu() if cfg.NUM_GPUS else metadata.detach()
            )

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            test_meter.iter_toc()
            # Update and log stats.
            test_meter.update_stats(preds, ori_boxes, metadata)
            test_meter.log_iter_stats(None, cur_iter)
        else:
            if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                if not cfg.CONTRASTIVE.KNN_ON:
                    test_meter.finalize_metrics()
                    return test_meter

                train_labels = (
                    model.module.train_labels
                    if hasattr(model, "module")
                    else model.train_labels
                )
                with torch.no_grad():
                    yd, yi = model(inputs, video_idx, time)
                batch_size = yi.shape[0]
                num_neighbors = yi.shape[1]
                num_classes = (
                    cfg.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM
                )  # eg 400 for Kinetics400
                # Look up the training label of every retrieved neighbour.
                candidates = train_labels.view(1, -1).expand(batch_size, -1)
                retrieval = torch.gather(candidates, 1, yi)
                # Allocate the one-hot buffer on the same device as the index
                # tensor so the k-NN vote also works when running on CPU.
                retrieval_one_hot = torch.zeros(
                    (batch_size * num_neighbors, num_classes),
                    device=retrieval.device,
                )
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                # Weight each neighbour's vote by exp(yd / T).
                yd_transform = yd.clone().div_(cfg.CONTRASTIVE.T).exp_()
                probs = torch.mul(
                    retrieval_one_hot.view(batch_size, -1, num_classes),
                    yd_transform.view(batch_size, -1, 1),
                )
                preds = torch.sum(probs, 1)
            else:
                # Perform the forward pass.
                with torch.no_grad():
                    preds = model(inputs)

            # Shared by the SSL k-NN and plain classification branches; the
            # detection branch above records its own stats.
            # Gather all the predictions across all the devices to perform
            # ensemble.
            if cfg.NUM_GPUS > 1:
                preds, labels, video_idx = du.all_gather(
                    [preds, labels, video_idx]
                )
            if cfg.NUM_GPUS:
                preds = preds.cpu()
                labels = labels.cpu()
                video_idx = video_idx.cpu()

            test_meter.iter_toc()
            # Update and log stats.
            test_meter.update_stats(
                preds.detach(), labels.detach(), video_idx.detach()
            )
            test_meter.log_iter_stats(cur_iter)

        test_meter.iter_tic()

    # Log epoch stats and print the final testing results.
    if not cfg.DETECTION.ENABLE:
        all_preds = test_meter.video_preds.clone().detach()
        all_labels = test_meter.video_labels
        if cfg.NUM_GPUS:
            all_preds = all_preds.cpu()
            all_labels = all_labels.cpu()
        if writer is not None:
            writer.plot_eval(preds=all_preds, labels=all_labels)

        if cfg.TEST.SAVE_RESULTS_PATH != "":
            save_path = os.path.join(cfg.OUTPUT_DIR, cfg.TEST.SAVE_RESULTS_PATH)

            # Only the root process writes the results file.
            if du.is_root_proc():
                with pathmgr.open(save_path, "wb") as f:
                    pickle.dump([all_preds, all_labels], f)

            logger.info(
                "Successfully saved prediction results to {}".format(save_path)
            )

    test_meter.finalize_metrics()
    return test_meter
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer=None,
               wandb_log=False):
    """
    Evaluate the model on the val set.

    Labels may be a single tensor or a dict with ``'verb'`` and ``'noun'``
    entries; in the latter case ``model`` is expected to return a
    ``(verb_preds, noun_preds)`` pair and verb / noun / action accuracies are
    tracked. The forward pass runs under ``torch.no_grad()``.

    Args:
        val_loader (loader): data loader to provide validation data. Each item
            is unpacked as ``(inputs, labels, _, meta)``.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the
            metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
        wandb_log (bool): if True, scalars are sent to ``wandb.log`` and the
            per-iteration / per-epoch TensorBoard scalars are skipped.

    Returns:
        tuple: ``(is_best_epoch, top1)`` where ``is_best_epoch`` comes from
            ``val_meter.log_epoch_stats`` and ``top1`` is its ``"top1_acc"``
            entry when present, otherwise its ``"top1_err"`` entry.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    # The loss module does not depend on the batch, so build it once.
    # Explicitly declare reduction to mean.
    loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i, clip in enumerate(inputs):
                    inputs[i] = clip.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if isinstance(labels, (dict,)):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i, item in enumerate(val):
                        val[i] = item.cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        val_meter.data_toc()

        # No gradients are needed for evaluation.
        with torch.no_grad():
            preds = model(inputs)

        global_step = len(val_loader) * cur_epoch + cur_iter

        if isinstance(labels, (dict,)):
            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)

            # Compute the verb accuracies.
            verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                preds[0], labels['verb'], (1, 5)
            )
            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                    [loss_verb, verb_top1_acc, verb_top5_acc]
                )
            # Copy the errors from GPU to CPU (sync point).
            loss_verb, verb_top1_acc, verb_top5_acc = (
                loss_verb.item(),
                verb_top1_acc.item(),
                verb_top5_acc.item(),
            )

            # Compute the noun accuracies.
            noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                preds[1], labels['noun'], (1, 5)
            )
            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                    [loss_noun, noun_top1_acc, noun_top5_acc]
                )
            # Copy the errors from GPU to CPU (sync point).
            loss_noun, noun_top1_acc, noun_top5_acc = (
                loss_noun.item(),
                noun_top1_acc.item(),
                noun_top5_acc.item(),
            )

            # Compute the action accuracies.
            action_top1_acc, action_top5_acc = (
                metrics.multitask_topk_accuracies(
                    (preds[0], preds[1]),
                    (labels['verb'], labels['noun']),
                    (1, 5),
                )
            )
            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss, action_top1_acc, action_top5_acc = du.all_reduce(
                    [loss, action_top1_acc, action_top5_acc]
                )
            # Copy the errors from GPU to CPU (sync point).
            loss, action_top1_acc, action_top5_acc = (
                loss.item(),
                action_top1_acc.item(),
                action_top5_acc.item(),
            )

            val_meter.iter_toc()
            # Update and log stats. When running on CPU (cfg.NUM_GPUS == 0),
            # use 1 to represent 1 CPU.
            val_meter.update_stats(
                (verb_top1_acc, noun_top1_acc, action_top1_acc),
                (verb_top5_acc, noun_top5_acc, action_top5_acc),
                inputs[0].size(0) * max(cfg.NUM_GPUS, 1),
            )

            # The same scalars go to whichever logging backend is active.
            scalars = {
                "Val/loss": loss,
                "Val/Top1_acc": action_top1_acc,
                "Val/Top5_acc": action_top5_acc,
                "Val/verb/loss": loss_verb,
                "Val/verb/Top1_acc": verb_top1_acc,
                "Val/verb/Top5_acc": verb_top5_acc,
                "Val/noun/loss": loss_noun,
                "Val/noun/Top1_acc": noun_top1_acc,
                "Val/noun/Top5_acc": noun_top5_acc,
            }
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(scalars, global_step=global_step)
            if wandb_log:
                wandb.log({**scalars, "val_step": global_step})

            val_meter.update_predictions(
                (preds[0], preds[1]), (labels['verb'], labels['noun'])
            )
        else:
            # Compute the loss.
            loss = loss_fun(preds, labels)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))

                # Convert the correct counts into error percentages.
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]
                # Combine the errors across the GPUs.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )
                # Copy the errors from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                val_meter.iter_toc()
                # Update and log stats. When running on CPU
                # (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    inputs[0].size(0) * max(cfg.NUM_GPUS, 1),
                )

                scalars = {
                    "Val/loss": loss,
                    "Val/Top1_err": top1_err,
                    "Val/Top5_err": top5_err,
                }
                # write to tensorboard format if available.
                if writer is not None and not wandb_log:
                    writer.add_scalars(scalars, global_step=global_step)
                if wandb_log:
                    wandb.log({**scalars, "val_step": global_step})

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    is_best_epoch, top1_dict = val_meter.log_epoch_stats(cur_epoch)

    # write to tensorboard format if available.
    if writer is not None:
        all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
        all_labels = [label.clone().detach() for label in val_meter.all_labels]
        if cfg.NUM_GPUS:
            all_preds = [pred.cpu() for pred in all_preds]
            all_labels = [label.cpu() for label in all_labels]
        writer.plot_eval(
            preds=all_preds, labels=all_labels, global_step=cur_epoch
        )

    # The presence of "top1_acc" marks the verb/noun (multi-task) meter.
    has_accuracy = "top1_acc" in top1_dict
    if has_accuracy:
        epoch_scalars = {
            "Val/epoch/Top1_acc": top1_dict["top1_acc"],
            "Val/epoch/verb/Top1_acc": top1_dict["verb_top1_acc"],
            "Val/epoch/noun/Top1_acc": top1_dict["noun_top1_acc"],
        }
    else:
        epoch_scalars = {"Val/epoch/Top1_err": top1_dict["top1_err"]}

    if writer is not None and not wandb_log:
        writer.add_scalars(epoch_scalars, global_step=cur_epoch)
    if wandb_log:
        wandb.log({**epoch_scalars, "epoch": cur_epoch})

    top1 = top1_dict["top1_acc"] if has_accuracy else top1_dict["top1_err"]

    val_meter.reset()
    return is_best_epoch, top1
def perform_test(test_loader, model, test_meter, cfg, writer=None, device='cpu'):
    """
    For classification:
    Perform multi-view testing that uniformly samples N clips from a video
    along its temporal axis. For each clip, it takes 3 crops to cover the
    spatial dimension, followed by averaging the softmax scores across all
    Nx3 views to form a video-level prediction. All video predictions are
    compared to ground-truth labels and the final testing performance is
    logged.
    For detection:
    Perform fully-convolutional testing on the full frames without crop.

    This variant also prints wall-clock timings for each stage and returns
    the last batch's ``preds`` and ``gap`` model outputs.

    NOTE(review): the source text of this function is corrupted in the middle
    of the classification branch (see the marked line below); the statements
    that originally sat between the two ``print`` calls are not recoverable
    from this file.

    Args:
        test_loader (loader): video testing loader. Each item is unpacked as
            ``(inputs, labels, video_idx, meta)``.
        model (model): the pretrained video model to test.
        test_meter (TestMeter): testing meters to log and ensemble the testing
            results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter object, optional): TensorboardWriter object
            to writer Tensorboard log.
        device: target passed to ``.to()`` for list inputs, labels and
            video_idx when ``cfg.NUM_GPUS`` is truthy.

    Returns:
        tuple: ``(preds, gap)`` as left by the final loop iteration.
    """
    # Enable eval mode.
    import time
    model.eval()
    test_meter.iter_tic()
    print('The len of dataloader: ', len(test_loader))
    ntic = time.time()
    for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader):
        # Time spent waiting on the data loader since the previous stamp.
        print(time.time() - ntic)
        print('in dataloader - input shape is: ', len(inputs))
        # Indexes inputs[0] and inputs[1], so at least two entries are expected.
        print(inputs[0].shape, inputs[1].shape)
        ntic = time.time()
        if cfg.NUM_GPUS:
            # Transfer the data to the requested device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].to(device, non_blocking=True)
            else:
                # NOTE(review): this branch and the meta loop below use
                # .cuda() instead of `device` — confirm whether intended.
                inputs = inputs.cuda(non_blocking=True)
            # Transfer the labels and clip indices to the requested device.
            labels = labels.to(device)
            video_idx = video_idx.to(device)
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        print('transfer to gpu: ', time.time() - ntic)
        ntic = time.time()
        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"].cpu()
            metadata = meta["metadata"].cpu()
            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)
            preds = preds.detach().cpu() if cfg.NUM_GPUS else preds.detach()
            ori_boxes = (ori_boxes.detach().cpu() if cfg.NUM_GPUS else ori_boxes.detach())
            metadata = (metadata.detach().cpu() if cfg.NUM_GPUS else metadata.detach())
            test_meter.iter_toc()
            # Update and log stats.
            test_meter.update_stats(preds, ori_boxes, metadata)
            test_meter.log_iter_stats(None, cur_iter)
        else:
            # Perform the forward pass.
            # NOTE(review): `time` is already imported at the top of the
            # function; this second import is redundant.
            import time
            # NOTE(review): no use of ntic_1 is visible in the surviving text.
            ntic_1 = time.time()
            with torch.no_grad():
                # The model is unpacked into three outputs here.
                preds, pre_gap, gap = model(inputs)
            # NOTE(review): the next line is corrupted in the source — the run
            # of asterisks replaces code that is missing from this file. It is
            # reproduced verbatim and must be restored from the original
            # before this function can run. Its placement inside this branch
            # is a guess.
            print('after fwd pass: '******'full test done: ', time.time() - ntic)
        ntic = time.time()
    # Log epoch stats and print the final testing results.
    if writer is not None and not cfg.DETECTION.ENABLE:
        all_preds = [pred.clone().detach() for pred in test_meter.video_preds]
        all_labels = [
            label.clone().detach() for label in test_meter.video_labels
        ]
        if cfg.NUM_GPUS:
            all_preds = [pred.cpu() for pred in all_preds]
            all_labels = [label.cpu() for label in all_labels]
        writer.plot_eval(preds=all_preds, labels=all_labels)
    test_meter.finalize_metrics()
    test_meter.reset()
    print('full func done: ', time.time() - ntic)
    # NOTE(review): `gap` is only assigned in the non-detection branch, and
    # both names are unset if the loader yields nothing — confirm callers.
    return preds, gap
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, train_loader, writer):
    """
    Evaluate the model on the val set.

    Handles three cases: detection (``cfg.DETECTION.ENABLE``), the contrastive
    SSL model evaluated by k-NN voting over the model's ``train_labels``, and
    plain classification. The forward pass runs under ``torch.no_grad()``.
    If the SSL model is selected but ``cfg.CONTRASTIVE.KNN_ON`` is false, the
    function returns at the first batch without logging epoch stats.

    Args:
        val_loader (loader): data loader to provide validation data. Each item
            is unpacked as ``(inputs, labels, index, time, meta)``.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the
            metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        train_loader (loader): not used by this function; kept for interface
            compatibility.
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log. May be None.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, index, time, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i, clip in enumerate(inputs):
                    inputs[i] = clip.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i, item in enumerate(val):
                        val[i] = item.cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
            index = index.cuda()
            time = time.cuda()
        # inputs[0] may itself be a list of tensors; use its first tensor then.
        batch_size = (
            inputs[0][0].size(0)
            if isinstance(inputs[0], list)
            else inputs[0].size(0)
        )
        val_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            # Compute the predictions; no gradients are needed for evaluation.
            with torch.no_grad():
                preds = model(inputs, meta["boxes"])
            ori_boxes = meta["ori_boxes"]
            metadata = meta["metadata"]

            if cfg.NUM_GPUS:
                preds = preds.cpu()
                ori_boxes = ori_boxes.cpu()
                metadata = metadata.cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds, ori_boxes, metadata)
        else:
            if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                if not cfg.CONTRASTIVE.KNN_ON:
                    return
                train_labels = (
                    model.module.train_labels
                    if hasattr(model, "module")
                    else model.train_labels
                )
                with torch.no_grad():
                    yd, yi = model(inputs, index, time)
                num_neighbors = yi.shape[1]
                num_classes = (
                    cfg.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM
                )  # eg 400 for Kinetics400
                # Look up the training label of every retrieved neighbour.
                candidates = train_labels.view(1, -1).expand(batch_size, -1)
                retrieval = torch.gather(candidates, 1, yi)
                # Allocate the one-hot buffer on the same device as the index
                # tensor so the k-NN vote also works when running on CPU.
                retrieval_one_hot = torch.zeros(
                    (batch_size * num_neighbors, num_classes),
                    device=retrieval.device,
                )
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                # Weight each neighbour's vote by exp(yd / T).
                yd_transform = yd.clone().div_(cfg.CONTRASTIVE.T).exp_()
                probs = torch.mul(
                    retrieval_one_hot.view(batch_size, -1, num_classes),
                    yd_transform.view(batch_size, -1, 1),
                )
                preds = torch.sum(probs, 1)
            else:
                with torch.no_grad():
                    preds = model(inputs)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))

                # Convert the correct counts into error percentages.
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]
                # Combine the errors across the GPUs.
                if cfg.NUM_GPUS > 1:
                    top1_err, top5_err = du.all_reduce([top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                top1_err, top5_err = top1_err.item(), top5_err.item()

                val_meter.iter_toc()
                # Update and log stats. When running on CPU
                # (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    batch_size * max(cfg.NUM_GPUS, 1),
                )
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {"Val/Top1_err": top1_err, "Val/Top5_err": top5_err},
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars(
                {"Val/mAP": val_meter.full_map}, global_step=cur_epoch
            )
        else:
            all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
            all_labels = [
                label.clone().detach() for label in val_meter.all_labels
            ]
            if cfg.NUM_GPUS:
                all_preds = [pred.cpu() for pred in all_preds]
                all_labels = [label.cpu() for label in all_labels]
            writer.plot_eval(
                preds=all_preds, labels=all_labels, global_step=cur_epoch
            )

    val_meter.reset()
def perform_test(test_loader, model, test_meter, cfg, writer=None):
    """
    For classification:
    Perform multi-view testing that uniformly samples N clips from a video
    along its temporal axis. For each clip, it takes 3 crops to cover the
    spatial dimension, followed by averaging the softmax scores across all
    Nx3 views to form a video-level prediction. All video predictions are
    compared to ground-truth labels and the final testing performance is
    logged.

    This variant always moves the data with ``.cuda()``, so a CUDA device is
    required. The forward pass runs under ``torch.no_grad()``.

    Args:
        test_loader (loader): video testing loader. Each item is unpacked as
            ``(inputs, labels, video_idx, meta)``.
        model (model): the pretrained video model to test.
        test_meter (TestMeter): testing meters to log and ensemble the testing
            results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter object, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()

    for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i, clip in enumerate(inputs):
                inputs[i] = clip.cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)

        labels = labels.cuda()
        video_idx = video_idx.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i, item in enumerate(val):
                    val[i] = item.cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Perform the forward pass; no gradients are needed for testing.
        with torch.no_grad():
            preds = model(inputs)

        # Gather all the predictions across all the devices to perform
        # ensemble.
        if cfg.NUM_GPUS > 1:
            preds, labels, video_idx = du.all_gather([preds, labels, video_idx])

        test_meter.iter_toc()
        # Update and log stats.
        test_meter.update_stats(
            preds.detach().cpu(),
            labels.detach().cpu(),
            video_idx.detach().cpu(),
        )
        test_meter.log_iter_stats(cur_iter)
        test_meter.iter_tic()

    # Log epoch stats and print the final testing results.
    if writer is not None:
        all_preds_cpu = [
            pred.clone().detach().cpu() for pred in test_meter.video_preds
        ]
        all_labels_cpu = [
            label.clone().detach().cpu() for label in test_meter.video_labels
        ]
        writer.plot_eval(preds=all_preds_cpu, labels=all_labels_cpu)

    test_meter.finalize_metrics()
    test_meter.reset()
def demo(cfg):
    """
    Run the action-recognition demo on ``cfg.DEMO.DATA_SOURCE`` and write the
    annotated frames to ``test.mp4``.

    The function builds the video model, loads a checkpoint, optionally builds
    a detectron2 person detector (when ``cfg.DETECTION.ENABLE``), then reads
    frames with OpenCV, buffers ``NUM_FRAMES * SAMPLING_RATE`` of them, runs
    the model on the buffer and writes drawn frames to the video writer.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; the placement of a few statements (marked inline) should be
    confirmed against the original file.

    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Setup logging format.
    logging.setup_logging()
    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)
    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")
    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2= "caffe2" in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )
    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of AVA dataset (one label per line; the last split
        # element is dropped).
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().split('\n')[:-1]
        # One random colour per label, used for the label backgrounds.
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
    else:
        # Load the labels of Kinetics-400 dataset from the 'name' column.
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df['name'].values
    # Counts how many full buffers have been processed (printed below).
    count_xxx = 0
    # frame_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES*cfg.DATA.SAMPLING_RATE
    frames = []        # preprocessed (RGB, rescaled) frames fed to the model
    org_frames = []    # the matching original BGR frames used for drawing
    mid_frame = None
    pred_labels = []
    cap = cv2.VideoCapture(cfg.DEMO.DATA_SOURCE)
    # NOTE(review): this first frame is overwritten by the read at the top of
    # the loop below, so it is never buffered.
    was_read, frame = cap.read()
    display_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    display_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    videowriter = cv2.VideoWriter('test.mp4',fourcc, fps, (display_width,display_height))
    while was_read :
        was_read, frame = cap.read()
        if not was_read:
            # NOTE(review): no break/return is visible after releasing the
            # writer, so the code below presumably still runs once with the
            # failed read's `frame` — confirm against the original.
            videowriter.release()
        if len(frames) != seq_len:
            # Buffer not full yet: preprocess and store the frame.
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            org_frames.append(frame)
        else:
            # Buffer full: run inference on it. The frame read in this
            # iteration is not appended in this branch.
            print(count_xxx)
            count_xxx += 1
            start = time()
            # if cfg.DETECTION.ENABLE and len(frames) == seq_len//2 - 1:
            mid_frame = org_frames[seq_len//2 - 2]
            draw_imgs = []
            # NOTE(review): with the guard above commented out, this loop
            # calls `object_predictor` even though it is only assigned when
            # cfg.DETECTION.ENABLE is true.
            for idx in range(seq_len//2 - 1):
                img = org_frames[idx]
                outputs = object_predictor(img)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                scores = fields["scores"][selection_mask]
                boxes = cv2_transform.scale_boxes(cfg.DATA.TEST_CROP_SIZE,
                                                  pred_boxes,
                                                  display_height,
                                                  display_width)
                # Prepend a zero column, then drop it again two lines below
                # after moving the boxes to numpy.
                boxes = torch.cat([torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)
                boxes = boxes.cpu().detach().numpy()
                # Scale the boxes back by (shorter display side / crop size).
                ratio = np.min( [display_height, display_width] ) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio
                for box in boxes:
                    xmin, ymin, xmax, ymax = box
                    # NOTE(review): coordinates are passed without the
                    # astype(int) conversion used further down — confirm
                    # cv2.rectangle accepts them.
                    cv2.rectangle(img, (xmin, ymin), (xmax , ymax), (0, 255, 0), thickness=2)
                draw_imgs.append(img)
            # Restart the timer so `s` below excludes the drawing loop above.
            start = time()
            if cfg.DETECTION.ENABLE:
                outputs = object_predictor(mid_frame)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                scores = fields["scores"][selection_mask]
                boxes = cv2_transform.scale_boxes(cfg.DATA.TEST_CROP_SIZE,
                                                  pred_boxes,
                                                  display_height,
                                                  display_width)
                boxes = torch.cat([torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)
            inputs = torch.as_tensor(frames).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)
            # 1 C T H W.
            inputs = inputs.unsqueeze(0)
            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))
            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1,
                                   fast_pathway.shape[2]//cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
            inputs = [slow_pathway, fast_pathway]
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            # Perform the forward pass.
            if cfg.DETECTION.ENABLE:
                # When there is nothing in the scene,
                # use a dummy variable to disable all computations below.
                if not len(boxes):
                    preds = torch.tensor([])
                else:
                    preds = model(inputs, boxes)
            else:
                preds = model(inputs)
            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]
            if cfg.DETECTION.ENABLE:
                # This post processing was intentionally assigned to the cpu since my laptop GPU
                # RTX 2080 runs out of its memory, if your GPU is more powerful, I'd recommend
                # to change this section to make CUDA do the processing.
                preds = preds.cpu().detach().numpy()
                # Keep every label whose score exceeds 0.1, per box.
                pred_masks = preds > .1
                label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
                pred_labels = [
                    [labels[label_id] for label_id in perbox_label_ids]
                    for perbox_label_ids in label_ids
                ]
                # I'm unsure how detectron2 rescales boxes to image original size, so I use
                # input boxes of slowfast and rescale them back instead, it's safer and even if boxes
                # were not rescaled by cv2_transform.rescale_boxes, it still works.
                boxes = boxes.cpu().detach().numpy()
                ratio = np.min( [display_height, display_width] ) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio
            else:
                ## Option 1: single label inference selected from the highest probability entry.
                # label_id = preds.argmax(-1).cpu()
                # pred_label = labels[label_id]
                # Option 2: multi-label inferencing selected from probability entries > threshold
                label_ids = torch.nonzero(preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ['Unknown']
            # # option 1: remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            # option 2: empty the buffer
            # Average processing time per buffered frame.
            # NOTE(review): `s` is not used after this assignment.
            s = (time() - start) / len(frames)
            if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
                for box, box_labels in zip(boxes.astype(int), pred_labels):
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(mid_frame, (xmin, ymin), (xmax , ymax), (0, 255, 0), thickness=2)
                    # label_origin is a slice of `box`; the in-place updates
                    # below move it upwards so labels stack above the box.
                    label_origin = box[:2]
                    for label in box_labels:
                        label_origin[-1] -= 5
                        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                        cv2.rectangle(
                            mid_frame,
                            (label_origin[0], label_origin[1] + 5),
                            (label_origin[0] + label_width, label_origin[1] - label_height - 5),
                            palette[labels.index(label)],
                            -1
                        )
                        cv2.putText(
                            mid_frame,
                            label,
                            tuple(label_origin),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            .5,
                            (255, 255, 255),
                            1
                        )
                        label_origin[-1] -= label_height + 5
            # NOTE(review): placed at buffer-processing level (not inside the
            # `if` above) — confirm against the original indentation.
            draw_imgs.append(mid_frame)
            for img_ in draw_imgs:
                videowriter.write(img_)
                # cv2.imwrite('out_image/test_%04d_%04d.jpg' % (count_xxx, i) ,frame )
            # Drop the frames that were just written, keep the rest buffered.
            frames = frames[seq_len//2 - 1:]
            org_frames = org_frames[seq_len//2 - 1:]
            # if count_xxx == 2:
            #     videowriter.release()
            #     exit()
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg):
    """
    Evaluate the model on the val set.

    The whole pass runs with gradient tracking disabled, and data is only
    moved to CUDA when `cfg.NUM_GPUS` is non-zero (same convention as
    `perform_test`), so the function also works for CPU-only evaluation.

    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    # Validation never back-propagates, so do not build the autograd graph.
    with torch.no_grad():
        for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
            if cfg.NUM_GPUS:
                # Transfer the data to the current GPU device.
                if isinstance(inputs, list):
                    for i, clip in enumerate(inputs):
                        inputs[i] = clip.cuda(non_blocking=True)
                else:
                    inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda()
                for key, val in meta.items():
                    if isinstance(val, list):
                        for i, item in enumerate(val):
                            val[i] = item.cuda(non_blocking=True)
                    else:
                        meta[key] = val.cuda(non_blocking=True)

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(
                        du.all_gather_unaligned(ori_boxes), dim=0
                    )
                    metadata = torch.cat(
                        du.all_gather_unaligned(metadata), dim=0
                    )

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    preds.cpu(), ori_boxes.cpu(), metadata.cpu()
                )
            else:
                preds = model(inputs)

                if cfg.DATA.MULTI_LABEL:
                    if cfg.NUM_GPUS > 1:
                        preds, labels = du.all_gather([preds, labels])
                    val_meter.update_predictions(preds, labels)
                else:
                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, 5)
                    )

                    # Turn the correct counts into error percentages.
                    top1_err, top5_err = [
                        (1.0 - x / preds.size(0)) * 100.0
                        for x in num_topks_correct
                    ]
                    # Combine the errors across the GPUs.
                    if cfg.NUM_GPUS > 1:
                        top1_err, top5_err = du.all_reduce(
                            [top1_err, top5_err]
                        )

                    # Copy the errors from GPU to CPU (sync point).
                    top1_err, top5_err = top1_err.item(), top5_err.item()

                    val_meter.iter_toc()
                    # Update and log stats. On CPU (NUM_GPUS == 0) count one
                    # device so the batch weight does not collapse to zero.
                    val_meter.update_stats(
                        top1_err,
                        top5_err,
                        inputs[0].size(0) * max(cfg.NUM_GPUS, 1),
                    )

            val_meter.log_iter_stats(cur_epoch, cur_iter)
            val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()
def eval_epoch(self, val_loader, model, val_meter, cur_epoch, cfg, writer=None):
    """
    Evaluate the model on the val set.

    The forward passes run with gradient tracking disabled. Per-iteration and
    per-epoch stats returned by the meter are forwarded to `self.plotStats`.

    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()

    data_size = len(val_loader)
    # NOTE(review): this mixes the `cfg` argument with `self.cfg`; kept as in
    # the original -- confirm both refer to the same config.
    batch_size = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
    rank_env = os.environ.get("RANK", None)
    world_env = os.environ.get("WORLD_SIZE", None)
    samples_per_epoch = data_size * batch_size
    self.logger.info(
        "Val Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
        .format(cur_epoch, data_size, batch_size, samples_per_epoch,
                du.get_local_rank(), du.get_rank(), rank_env,
                du.get_world_size(), world_env))

    val_meter.iter_tic()
    # Last-column scores and labels, collected only for two-class models.
    preds_all = []
    labels_all = []

    # Validation never back-propagates, so do not build the autograd graph.
    with torch.no_grad():
        for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
            # Transfer the data to the current GPU device.
            if isinstance(inputs, list):
                for i, clip in enumerate(inputs):
                    inputs[i] = clip.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, list):
                    for i, item in enumerate(val):
                        val[i] = item.cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(
                        du.all_gather_unaligned(ori_boxes), dim=0
                    )
                    metadata = torch.cat(
                        du.all_gather_unaligned(metadata), dim=0
                    )

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    preds.cpu(), ori_boxes.cpu(), metadata.cpu()
                )
            else:
                preds = model(inputs)

                if cfg.DATA.MULTI_LABEL:
                    if cfg.NUM_GPUS > 1:
                        preds, labels = du.all_gather([preds, labels])
                else:
                    if cfg.MODEL.NUM_CLASSES == 2:
                        preds_all.extend(preds.detach().cpu().numpy()[:, -1])
                        labels_all.extend(labels.detach().cpu().numpy())

                    # Compute the errors; k is capped at the class count.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES))
                    )

                    # Turn the correct counts into error percentages.
                    top1_err, top5_err = [
                        (1.0 - x / preds.size(0)) * 100.0
                        for x in num_topks_correct
                    ]
                    # Combine the errors across the GPUs.
                    if cfg.NUM_GPUS > 1:
                        top1_err, top5_err = du.all_reduce(
                            [top1_err, top5_err]
                        )

                    # Copy the errors from GPU to CPU (sync point).
                    top1_err, top5_err = top1_err.item(), top5_err.item()

                    val_meter.iter_toc()
                    # Update and log stats.
                    val_meter.update_stats(
                        top1_err, top5_err, inputs[0].size(0) * cfg.NUM_GPUS
                    )

                    global_iter = data_size * cur_epoch + cur_iter
                    # write to tensorboard format if available.
                    if writer is not None:
                        writer.add_scalars(
                            {
                                "Val/Top1_err": top1_err,
                                "Val/Top5_err": top5_err,
                            },
                            global_step=global_iter,
                        )

                    if du.is_master_proc():
                        self.logger.log_row(name='ValTop1',
                                            iter=global_iter,
                                            lr=top1_err,
                                            description="Top 1 Err")
                        self.logger.log_row(name='ValTop5',
                                            iter=global_iter,
                                            lr=top5_err,
                                            description="Top 5 Err")

                val_meter.update_predictions(preds, labels)

            stats = val_meter.log_iter_stats(cur_epoch, cur_iter, preds_all,
                                             labels_all)
            # x-axis position counted in samples seen so far.
            self.plotStats(
                stats,
                samples_per_epoch * cur_epoch + batch_size * (cur_iter + 1),
                'ValIter',
            )
            val_meter.iter_tic()

    # Log epoch stats on the scores/labels gathered from all processes.
    gathered = du.all_gather([
        torch.tensor(preds_all).to(torch.device("cuda")),
        torch.tensor(labels_all).to(torch.device("cuda")),
    ])
    stats = val_meter.log_epoch_stats(cur_epoch,
                                      gathered[0].detach().cpu().numpy(),
                                      gathered[1].detach().cpu().numpy())
    self.plotStats(stats, (cur_epoch + 1) * samples_per_epoch, 'ValEpoch')

    # write to tensorboard format if available.
    # Per-epoch prediction plotting is disabled in this variant, so the CPU
    # copies of all predictions/labels that only fed that plot are not built.
    if writer is not None and cfg.DETECTION.ENABLE:
        writer.add_scalars({"Val/mAP": val_meter.full_map},
                           global_step=cur_epoch)

    val_meter.reset()
def perform_test(test_loader, model, test_meter, cfg, writer=None):
    """
    For classification:
    Perform mutli-view testing that uniformly samples N clips from a video
    along its temporal axis. For each clip, it takes 3 crops to cover the
    spatial dimension, followed by averaging the softmax scores across all
    Nx3 views to form a video-level prediction. All video predictions are
    compared to ground-truth labels and the final testing performance is
    logged.
    For detection:
    Perform fully-convolutional testing on the full frames without crop.

    All forward passes run with gradient tracking disabled.

    Args:
        test_loader (loader): video testing loader.
        model (model): the pretrained video model to test.
        test_meter (TestMeter): testing meters to log and ensemble the testing
            results.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter object, optional): TensorboardWriter object
            to writer Tensorboard log.

    Returns:
        The `test_meter` that was passed in, after `finalize_metrics()`.
    """
    # Enable eval mode.
    model.eval()
    test_meter.iter_tic()

    # Testing never back-propagates, so do not build the autograd graph.
    with torch.no_grad():
        for cur_iter, (inputs, labels, video_idx, meta) in enumerate(
                test_loader):
            if cfg.NUM_GPUS:
                # Transfer the data to the current GPU device.
                if isinstance(inputs, list):
                    for i, clip in enumerate(inputs):
                        inputs[i] = clip.cuda(non_blocking=True)
                else:
                    inputs = inputs.cuda(non_blocking=True)

                labels = labels.cuda()
                video_idx = video_idx.cuda()
                for key, val in meta.items():
                    if isinstance(val, list):
                        for i, item in enumerate(val):
                            val[i] = item.cuda(non_blocking=True)
                    else:
                        meta[key] = val.cuda(non_blocking=True)
            test_meter.data_toc()

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])
                ori_boxes = meta["ori_boxes"]
                metadata = meta["metadata"]

                preds = preds.detach()
                ori_boxes = ori_boxes.detach()
                metadata = metadata.detach()
                if cfg.NUM_GPUS:
                    preds = preds.cpu()
                    ori_boxes = ori_boxes.cpu()
                    metadata = metadata.cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(
                        du.all_gather_unaligned(ori_boxes), dim=0
                    )
                    metadata = torch.cat(
                        du.all_gather_unaligned(metadata), dim=0
                    )

                test_meter.iter_toc()
                # Update and log stats.
                test_meter.update_stats(preds, ori_boxes, metadata)
                test_meter.log_iter_stats(None, cur_iter)
            else:
                # Perform the forward pass.
                preds = model(inputs)

                # Gather all the predictions across all the devices to
                # perform ensemble.
                if cfg.NUM_GPUS > 1:
                    preds, labels, video_idx = du.all_gather(
                        [preds, labels, video_idx]
                    )
                if cfg.NUM_GPUS:
                    preds = preds.cpu()
                    labels = labels.cpu()
                    video_idx = video_idx.cpu()

                test_meter.iter_toc()
                # Update and log stats.
                test_meter.update_stats(
                    preds.detach(), labels.detach(), video_idx.detach()
                )
                test_meter.log_iter_stats(cur_iter)

            test_meter.iter_tic()

    # Log epoch stats and print the final testing results.
    if not cfg.DETECTION.ENABLE:
        all_preds = test_meter.video_preds.clone().detach()
        all_labels = test_meter.video_labels
        if cfg.NUM_GPUS:
            all_preds = all_preds.cpu()
            all_labels = all_labels.cpu()
        if writer is not None:
            writer.plot_eval(preds=all_preds, labels=all_labels)

        if cfg.TEST.SAVE_RESULTS_PATH != "":
            save_path = os.path.join(cfg.OUTPUT_DIR,
                                     cfg.TEST.SAVE_RESULTS_PATH)

            # Stored as a two-element list: [labels, predictions].
            with PathManager.open(save_path, "wb") as f:
                pickle.dump([all_labels, all_preds], f)

            logger.info("Successfully saved prediction results to {}".format(
                save_path))

    test_meter.finalize_metrics()
    return test_meter
def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer=None):
    """
    Evaluate the model on the val set and return the Matthews correlation
    coefficient (MCC) of the epoch's predictions.

    The forward passes run with gradient tracking disabled, and data is only
    moved to CUDA when `cfg.NUM_GPUS` is non-zero. The MCC is computed from
    the entries [0, 0], [0, 1], [1, 0] and [1, 1] of the confusion matrix
    built from the argmax of all predictions stored in `val_meter`.

    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.

    Returns:
        float: the MCC; 0.0 when its denominator is zero (some row or column
        of the confusion matrix is empty), following the usual convention,
        instead of NaN.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    # Validation never back-propagates, so do not build the autograd graph.
    with torch.no_grad():
        for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
            if cfg.NUM_GPUS:
                # Transfer the data to the current GPU device.
                if isinstance(inputs, list):
                    for i, clip in enumerate(inputs):
                        inputs[i] = clip.cuda(non_blocking=True)
                else:
                    inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda()
                for key, val in meta.items():
                    if isinstance(val, list):
                        for i, item in enumerate(val):
                            val[i] = item.cuda(non_blocking=True)
                    else:
                        meta[key] = val.cuda(non_blocking=True)

            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

                preds = preds.cpu()
                ori_boxes = meta["ori_boxes"].cpu()
                metadata = meta["metadata"].cpu()

                if cfg.NUM_GPUS > 1:
                    preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                    ori_boxes = torch.cat(
                        du.all_gather_unaligned(ori_boxes), dim=0
                    )
                    metadata = torch.cat(
                        du.all_gather_unaligned(metadata), dim=0
                    )

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    preds.cpu(), ori_boxes.cpu(), metadata.cpu()
                )
            else:
                preds = model(inputs)

                if cfg.DATA.MULTI_LABEL:
                    if cfg.NUM_GPUS > 1:
                        preds, labels = du.all_gather([preds, labels])
                else:
                    # Compute the errors. Both requested ks are 1, so the
                    # value reported as "top5" is also a top-1 error.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, 1)
                    )

                    # Turn the correct counts into error percentages.
                    top1_err, top5_err = [
                        (1.0 - x / preds.size(0)) * 100.0
                        for x in num_topks_correct
                    ]
                    # Combine the errors across the GPUs.
                    if cfg.NUM_GPUS > 1:
                        top1_err, top5_err = du.all_reduce(
                            [top1_err, top5_err]
                        )

                    # Copy the errors from GPU to CPU (sync point).
                    top1_err, top5_err = top1_err.item(), top5_err.item()

                    val_meter.iter_toc()
                    # Update and log stats. On CPU (NUM_GPUS == 0) count one
                    # device so the batch weight does not collapse to zero.
                    val_meter.update_stats(
                        top1_err,
                        top5_err,
                        inputs[0].size(0) * max(cfg.NUM_GPUS, 1),
                    )
                    # write to tensorboard format if available.
                    if writer is not None:
                        writer.add_scalars(
                            {
                                "Val/Top1_err": top1_err,
                                "Val/Top5_err": top5_err,
                            },
                            global_step=len(val_loader) * cur_epoch + cur_iter,
                        )

                val_meter.update_predictions(preds, labels)

            val_meter.log_iter_stats(cur_epoch, cur_iter)
            val_meter.iter_tic()

    logger.info('COMPUTING MCC')
    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)

    all_preds_cpu = [
        pred.clone().detach().cpu() for pred in val_meter.all_preds
    ]
    all_labels_cpu = [
        label.clone().detach().cpu() for label in val_meter.all_labels
    ]

    logger.info('PREPROC FOR MCC')
    ypreds = torch.argmax(torch.cat(all_preds_cpu), dim=1)
    ytrue = torch.cat(all_labels_cpu)

    logger.info('COMPUTE CM')
    device = 'cuda' if cfg.NUM_GPUS else 'cpu'
    cm = plmetrics.ConfusionMatrix()(ypreds.to(device), ytrue.to(device))
    logger.info('CM COMPUTED')

    # Work in float64: the denominator is a product of four counts and can
    # overflow or lose precision in the matrix's native dtype.
    cm = cm.double()
    # NOTE(review): which off-diagonal cell is FP vs FN depends on the
    # row/column convention of plmetrics.ConfusionMatrix; the MCC formula is
    # symmetric in the two, so the result is the same either way.
    tp, tn, fn, fp = cm[1, 1], cm[0, 0], cm[0, 1], cm[1, 0]
    denom = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denom.item() > 0:
        mcc = ((tp * tn - fp * fn) / torch.sqrt(denom)).item()
    else:
        # Degenerate matrix: MCC is undefined here; report 0.0, not NaN.
        mcc = 0.0
    logger.info('COMPUTED MCC')

    # write to tensorboard format if available.
    if writer is not None:
        if cfg.DETECTION.ENABLE:
            writer.add_scalars({"Val/mAP": val_meter.full_map},
                               global_step=cur_epoch)
        writer.plot_eval(preds=all_preds_cpu,
                         labels=all_labels_cpu,
                         global_step=cur_epoch)

    val_meter.reset()

    return mcc