def main():
    args = parse_args()

    logger = logging.getLogger('demo')
    if not logger.isEnabledFor(logging.INFO):  # setup_logger has not been called yet
        setup_logger(output=args.output_dir, name='demo')

    logger.info(pprint.pformat(args))
    logger.info(config)

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.TEST.GPUS)
    if len(gpus) > 1:
        raise ValueError('Test only supports a single GPU.')
    device = torch.device('cuda:{}'.format(gpus[0]))

    # build model
    model = build_segmentation_model_from_cfg(config)

    # Change ASPP image pooling
    # output_stride = 2 ** (5 - sum(config.MODEL.BACKBONE.DILATION))
    # train_crop_h, train_crop_w = config.TEST.CROP_SIZE
    # scale = 1. / output_stride
    # pool_h = int((float(train_crop_h) - 1.0) * scale + 1.0)
    # pool_w = int((float(train_crop_w) - 1.0) * scale + 1.0)
    # model.set_image_pooling((pool_h, pool_w))

    logger.info("Model:\n{}".format(model))
    model = model.to(device)

    try:
        # build data_loader
        data_loader = build_test_loader_from_cfg(config)
        meta_dataset = data_loader.dataset
        save_intermediate_outputs = True
    except Exception:
        logger.warning(
            "Cannot build data loader; using default meta data. "
            "This disables visualizing intermediate outputs.")
        if 'cityscapes' in config.DATASET.DATASET:
            meta_dataset = CityscapesMeta()
        else:
            raise ValueError("Unsupported dataset: {}".format(config.DATASET.DATASET))
        save_intermediate_outputs = False

    # load model weights
    if config.TEST.MODEL_FILE:
        model_state_file = config.TEST.MODEL_FILE
    else:
        model_state_file = os.path.join(config.OUTPUT_DIR, 'final_state.pth')

    if os.path.isfile(model_state_file):
        model_weights = torch.load(model_state_file)
        if 'state_dict' in model_weights.keys():
            model_weights = model_weights['state_dict']
            logger.info('Evaluating an intermediate checkpoint.')
        model.load_state_dict(model_weights, strict=False)
        logger.info('Test model loaded from {}'.format(model_state_file))
    else:
        if not config.DEBUG.DEBUG:
            raise ValueError('Cannot find test model.')

    # collect input images
    input_list = []
    if os.path.exists(args.input_files):
        if os.path.isfile(args.input_files):
            # inference on a single file; check the extension
            ext = os.path.splitext(os.path.basename(args.input_files))[1]
            if ext in ['.png', '.jpg', '.jpeg']:
                # image file
                input_list.append(args.input_files)
            elif ext in ['.mpeg']:
                # video file
                # TODO: decode video and convert to image list
                raise NotImplementedError("Inference on video is not supported yet.")
            else:
                raise ValueError("Unsupported extension: {}.".format(ext))
        else:
            # inference on a directory
            for fname in glob.glob(os.path.join(args.input_files, '*' + args.extension)):
                input_list.append(fname)
    else:
        raise ValueError('Input file or directory does not exist: {}'.format(args.input_files))

    if isinstance(input_list[0], str):
        logger.info("Inference on images")
        logger.info(input_list)
    else:
        logger.info("Inference on video")

    # dir to save intermediate raw outputs
    raw_out_dir = os.path.join(args.output_dir, 'raw')
    PathManager.mkdirs(raw_out_dir)
    # dir to save semantic outputs
    semantic_out_dir = os.path.join(args.output_dir, 'semantic')
    PathManager.mkdirs(semantic_out_dir)
    # dir to save instance outputs
    instance_out_dir = os.path.join(args.output_dir, 'instance')
    PathManager.mkdirs(instance_out_dir)
    # dir to save panoptic outputs
    panoptic_out_dir = os.path.join(args.output_dir, 'panoptic')
    PathManager.mkdirs(panoptic_out_dir)

    # Test loop
    model.eval()

    # build image demo transform
    transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(config.DATASET.MEAN, config.DATASET.STD)
    ])

    net_time = AverageMeter()
    post_time = AverageMeter()
    try:
        with torch.no_grad():
            for i, fname in enumerate(input_list):
                if isinstance(fname, str):
                    # load image
                    raw_image = read_image(fname, 'RGB')
                else:
                    raise NotImplementedError("Inference on video is not supported yet.")

                # pad image so each side has size 32 * k + 1
                raw_shape = raw_image.shape[:2]
                raw_h = raw_shape[0]
                raw_w = raw_shape[1]
                new_h = (raw_h + 31) // 32 * 32 + 1
                new_w = (raw_w + 31) // 32 * 32 + 1
                input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
                input_image[:, :] = config.DATASET.MEAN
                input_image[:raw_h, :raw_w, :] = raw_image

                image, _ = transforms(input_image, None)
                image = image.unsqueeze(0).to(device)

                # network
                start_time = time.time()
                out_dict = model(image)
                torch.cuda.synchronize(device)
                net_time.update(time.time() - start_time)

                # post-processing
                start_time = time.time()
                semantic_pred = get_semantic_segmentation(out_dict['semantic'])
                panoptic_pred, center_pred = get_panoptic_segmentation(
                    semantic_pred,
                    out_dict['center'],
                    out_dict['offset'],
                    thing_list=meta_dataset.thing_list,
                    label_divisor=meta_dataset.label_divisor,
                    stuff_area=config.POST_PROCESSING.STUFF_AREA,
                    void_label=(meta_dataset.label_divisor * meta_dataset.ignore_label),
                    threshold=config.POST_PROCESSING.CENTER_THRESHOLD,
                    nms_kernel=config.POST_PROCESSING.NMS_KERNEL,
                    top_k=config.POST_PROCESSING.TOP_K_INSTANCE,
                    foreground_mask=None)
                torch.cuda.synchronize(device)
                post_time.update(time.time() - start_time)

                logger.info(
                    '[{}/{}]\t'
                    'Network Time: {net_time.val:.3f}s ({net_time.avg:.3f}s)\t'
                    'Post-processing Time: {post_time.val:.3f}s ({post_time.avg:.3f}s)\t'.format(
                        i, len(input_list), net_time=net_time, post_time=post_time))

                # save predictions
                semantic_pred = semantic_pred.squeeze(0).cpu().numpy()
                panoptic_pred = panoptic_pred.squeeze(0).cpu().numpy()

                # crop predictions back to the raw input size
                semantic_pred = semantic_pred[:raw_h, :raw_w]
                panoptic_pred = panoptic_pred[:raw_h, :raw_w]

                if save_intermediate_outputs:
                    # raw outputs
                    save_debug_images(
                        dataset=meta_dataset,
                        batch_images=image,
                        batch_targets={},
                        batch_outputs=out_dict,
                        out_dir=raw_out_dir,
                        iteration=i,
                        target_keys=[],
                        output_keys=['semantic', 'center', 'offset'],
                        is_train=False,
                    )

                save_annotation(semantic_pred, semantic_out_dir, 'semantic_pred_%d' % i,
                                add_colormap=True,
                                colormap=meta_dataset.create_label_colormap(),
                                image=raw_image if args.merge_image else None)
                pan_to_sem = panoptic_pred // meta_dataset.label_divisor
                save_annotation(pan_to_sem, semantic_out_dir, 'panoptic_to_semantic_pred_%d' % i,
                                add_colormap=True,
                                colormap=meta_dataset.create_label_colormap(),
                                image=raw_image if args.merge_image else None)
                ins_id = panoptic_pred % meta_dataset.label_divisor
                pan_to_ins = panoptic_pred.copy()
                pan_to_ins[ins_id == 0] = 0
                save_instance_annotation(pan_to_ins, instance_out_dir,
                                         'panoptic_to_instance_pred_%d' % i,
                                         image=raw_image if args.merge_image else None)
                save_panoptic_annotation(panoptic_pred, panoptic_out_dir, 'panoptic_pred_%d' % i,
                                         label_divisor=meta_dataset.label_divisor,
                                         colormap=meta_dataset.create_label_colormap(),
                                         image=raw_image if args.merge_image else None)
    except Exception:
        logger.exception("Exception during demo:")
        raise
    finally:
        logger.info("Demo finished.")
        if save_intermediate_outputs:
            logger.info("Intermediate outputs saved to {}".format(raw_out_dir))
        logger.info("Semantic predictions saved to {}".format(semantic_out_dir))
        logger.info("Instance predictions saved to {}".format(instance_out_dir))
        logger.info("Panoptic predictions saved to {}".format(panoptic_out_dir))
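
# NOTE: every entry point in this file pads each frame so that both sides have
# size 32*k + 1 (padding filled with the dataset mean) and crops predictions
# back to the raw size afterwards. Below is a minimal self-contained sketch of
# that round trip; the helper name `_pad_to_grid` is illustrative and not part
# of the original code.
def _pad_to_grid(raw_image, mean, multiple=32):
    """Pad an HxWx3 uint8 image so each side is (multiple * k + 1), filled with `mean`."""
    raw_h, raw_w = raw_image.shape[:2]
    new_h = (raw_h + multiple - 1) // multiple * multiple + 1
    new_w = (raw_w + multiple - 1) // multiple * multiple + 1
    padded = np.zeros((new_h, new_w, 3), dtype=np.uint8)
    padded[:, :] = mean  # fill with the dataset mean, matching the demo loops
    padded[:raw_h, :raw_w, :] = raw_image
    return padded, (raw_h, raw_w)
# e.g. a 1024x2048 Cityscapes frame becomes 1025x2049; predictions are later
# cropped back with pred[:raw_h, :raw_w].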
def main(self, frame, index, total):
    self.model.eval()

    # build image demo transform
    transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(config.DATASET.MEAN, config.DATASET.STD)
    ])

    net_time = AverageMeter()
    post_time = AverageMeter()
    try:
        with torch.no_grad():
            raw_image = frame

            # pad image
            raw_shape = raw_image.shape[:2]
            raw_h = raw_shape[0]
            raw_w = raw_shape[1]
            new_h = (raw_h + 31) // 32 * 32 + 1
            new_w = (raw_w + 31) // 32 * 32 + 1
            input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
            input_image[:, :] = config.DATASET.MEAN
            input_image[:raw_h, :raw_w, :] = raw_image

            image, _ = transforms(input_image, None)
            image = image.unsqueeze(0).to(self.device)

            # network
            start_time = time.time()
            out_dict = self.model(image)
            torch.cuda.synchronize(self.device)
            net_time.update(time.time() - start_time)

            # post-processing
            start_time = time.time()
            semantic_pred = get_semantic_segmentation(out_dict['semantic'])
            panoptic_pred, center_pred = get_panoptic_segmentation(
                semantic_pred,
                out_dict['center'],
                out_dict['offset'],
                thing_list=self.meta_dataset.thing_list,
                label_divisor=self.meta_dataset.label_divisor,
                stuff_area=config.POST_PROCESSING.STUFF_AREA,
                void_label=(self.meta_dataset.label_divisor *
                            self.meta_dataset.ignore_label),
                threshold=config.POST_PROCESSING.CENTER_THRESHOLD,
                nms_kernel=config.POST_PROCESSING.NMS_KERNEL,
                top_k=config.POST_PROCESSING.TOP_K_INSTANCE,
                foreground_mask=None)
            torch.cuda.synchronize(self.device)
            post_time.update(time.time() - start_time)

            self.logger.info(
                '[{}/{}]\t'
                'Network Time: {net_time.val:.3f}s ({net_time.avg:.3f}s)\t'
                'Post-processing Time: {post_time.val:.3f}s ({post_time.avg:.3f}s)\t'.format(
                    index, total, net_time=net_time, post_time=post_time))

            # save predictions
            # semantic_pred = semantic_pred.squeeze(0).cpu().numpy()
            panoptic_pred = panoptic_pred.squeeze(0).cpu().numpy()

            # crop predictions
            # semantic_pred = semantic_pred[:raw_h, :raw_w]
            panoptic_pred = panoptic_pred[:raw_h, :raw_w]

            frame = creat_panoptic_annotation(
                panoptic_pred,
                label_divisor=self.meta_dataset.label_divisor,
                colormap=self.meta_dataset.create_label_colormap(),
                image=raw_image)
    except Exception:
        self.logger.exception("Exception during demo:")
        raise
    finally:
        self.logger.info("Demo finished.")
    return frame
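
# Usage sketch (hypothetical): this per-frame method is meant to be driven by
# an external video loop. Assuming it lives on a wrapper class (call it
# `VideoPredictor`) that owns `model`, `device`, `meta_dataset` and `logger`,
# a caller might do:
#
#     predictor = VideoPredictor(...)  # hypothetical constructor
#     annotated = [predictor.main(frame, i, len(frames))
#                  for i, frame in enumerate(frames)]
#
# Each call returns the input frame overlaid with the panoptic annotation.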
def multi_scale_inference(config, model, raw_image, t_image, device):
    scales = config.TEST.SCALE_LIST
    flip = config.TEST.FLIP_TEST
    # output_stride = 2 ** (5 - sum(config.MODEL.BACKBONE.DILATION))
    # train_crop_h, train_crop_w = config.TEST.CROP_SIZE
    # scale = 1. / output_stride
    # pool_h = int((float(train_crop_h) - 1.0) * scale + 1.0)
    # pool_w = int((float(train_crop_w) - 1.0) * scale + 1.0)

    # transforms
    transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(config.DATASET.MEAN, config.DATASET.STD)
    ])
    if flip:
        flip_range = 2
    else:
        flip_range = 1

    # h, w, _ = raw_image.shape
    _, _, h, w = t_image.shape
    org_h_pad = (h + 31) // 32 * 32
    org_w_pad = (w + 31) // 32 * 32

    sum_semantic_with_flip = 0
    sum_center_with_flip = 0
    sum_offset_with_flip = 0

    for i in range(len(scales)):
        image = raw_image
        scale = scales[i]
        raw_h = int(h * scale)
        raw_w = int(w * scale)

        image = cv2.resize(image, None, fx=scale, fy=scale,
                           interpolation=cv2.INTER_LINEAR).astype(np.int32)
        nh, nw, _ = image.shape

        # pad image
        new_h = (raw_h + 31) // 32 * 32
        new_w = (raw_w + 31) // 32 * 32
        input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
        input_image[:, :] = config.DATASET.MEAN
        # input_image[:raw_h, :raw_w, :] = image
        input_image[:nh, :nw, :] = image

        image, _ = transforms(input_image, None)
        image = image.unsqueeze(0).to(device)

        model = model.to(device)

        for flip in range(flip_range):
            if flip:
                image = flip_tensor(image, 3)
            out_dict = model(image)
            for key in out_dict.keys():
                # return to raw_input shape
                out_dict[key] = out_dict[key][:, :, :raw_h, :raw_w]

            if raw_h != org_h_pad or raw_w != org_w_pad:
                out_dict = upsample_predictions(out_dict, (org_h_pad, org_w_pad), scale)

            # average softmax or logit?
            semantic_pred = out_dict['semantic']
            # semantic_pred = F.softmax(out_dict['semantic'], dim=1)
            center_pred = out_dict['center']
            offset_pred = out_dict['offset']
            if flip:
                semantic_pred = flip_tensor(semantic_pred, 3)
                center_pred = flip_tensor(center_pred, 3)
                offset_pred = flip_tensor(offset_pred, 3)
                # un-flipping changes the sign of the x-offset channel
                offset_pred[:, 1, :, :] *= (-1)
            sum_semantic_with_flip += semantic_pred
            sum_center_with_flip += center_pred
            sum_offset_with_flip += offset_pred

    semantic_mean = sum_semantic_with_flip / (flip_range * len(scales))
    center_mean = sum_center_with_flip / (flip_range * len(scales))
    offset_mean = sum_offset_with_flip / (flip_range * len(scales))
    out_dict['semantic'] = semantic_mean
    out_dict['center'] = center_mean
    out_dict['offset'] = offset_mean

    return out_dict
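
# Usage sketch: multi-scale/flip fusion replaces the single forward pass in
# the demo loop; the fused outputs go through the same post-processing. This
# mirrors the calls used in `main` above (hedged: not verified against the
# original evaluation script):
#
#     out_dict = multi_scale_inference(config, model, input_image, image, device)
#     semantic_pred = get_semantic_segmentation(out_dict['semantic'])
#     panoptic_pred, _ = get_panoptic_segmentation(
#         semantic_pred, out_dict['center'], out_dict['offset'], ...)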
def main():
    args = parse_args()

    logger = logging.getLogger('segment_video.py')
    if not logger.isEnabledFor(logging.INFO):  # setup_logger has not been called yet
        setup_logger(output=args.output_dir, name='segment_video.py')

    logger.info(pprint.pformat(args))
    logger.info(config)

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    gpus = list(config.TEST.GPUS)
    if len(gpus) > 1:
        raise ValueError('Test only supports a single GPU.')
    device = torch.device('cuda:{}'.format(gpus[0]))

    # build model
    model = build_segmentation_model_from_cfg(config)
    logger.info("Model:\n{}".format(model))
    model = model.to(device)

    meta_dataset = CityscapesMeta()

    # load model weights
    if config.TEST.MODEL_FILE:
        model_state_file = config.TEST.MODEL_FILE
    else:
        model_state_file = os.path.join(config.OUTPUT_DIR, 'final_state.pth')

    if os.path.isfile(model_state_file):
        model_weights = torch.load(model_state_file)
        if 'state_dict' in model_weights.keys():
            model_weights = model_weights['state_dict']
            logger.info('Evaluating an intermediate checkpoint.')
        model.load_state_dict(model_weights, strict=True)
        logger.info('Test model loaded from {}'.format(model_state_file))
    else:
        if not config.DEBUG.DEBUG:
            raise ValueError('Cannot find test model.')

    model.eval()

    # open input video
    cap = None
    if os.path.exists(args.input):
        if os.path.isfile(args.input):
            # extract extension
            ext = os.path.splitext(os.path.basename(args.input))[1]
            if ext in ['.mpeg', '.mp4']:
                cap = cv2.VideoCapture(args.input)
            else:
                raise ValueError("Unsupported extension: {}.".format(ext))
        else:
            raise ValueError("Input must be a file, not a directory: {}".format(args.input))
    else:
        raise ValueError('Input file does not exist: {}'.format(args.input))

    # dir to save panoptic outputs
    panoptic_out_dir = os.path.join(args.output_dir, 'panoptic')
    PathManager.mkdirs(panoptic_out_dir)

    # build image demo transform
    transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(config.DATASET.MEAN, config.DATASET.STD)
    ])

    # get video information
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # define the codec and create the VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    out = cv2.VideoWriter(args.output_dir + '/output.avi', fourcc, fps, (width, height))

    try:
        with torch.no_grad():
            pbar = tqdm(total=length)
            ii = 0
            while cap.isOpened():
                ret, raw_image = cap.read()
                if ret:
                    # pad image
                    raw_shape = raw_image.shape[:2]
                    raw_h = raw_shape[0]
                    raw_w = raw_shape[1]
                    new_h = (raw_h + 31) // 32 * 32 + 1
                    new_w = (raw_w + 31) // 32 * 32 + 1
                    input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
                    input_image[:, :] = config.DATASET.MEAN
                    input_image[:raw_h, :raw_w, :] = raw_image

                    image, _ = transforms(input_image, None)
                    image = image.unsqueeze(0).to(device)

                    # network
                    out_dict = model(image)
                    torch.cuda.synchronize(device)

                    # post-processing
                    semantic_pred = get_semantic_segmentation(out_dict['semantic'])
                    panoptic_pred, center_pred = get_panoptic_segmentation(
                        semantic_pred,
                        out_dict['center'],
                        out_dict['offset'],
                        thing_list=meta_dataset.thing_list,
                        label_divisor=meta_dataset.label_divisor,
                        stuff_area=config.POST_PROCESSING.STUFF_AREA,
                        void_label=(meta_dataset.label_divisor * meta_dataset.ignore_label),
                        threshold=config.POST_PROCESSING.CENTER_THRESHOLD,
                        nms_kernel=config.POST_PROCESSING.NMS_KERNEL,
                        top_k=config.POST_PROCESSING.TOP_K_INSTANCE,
                        foreground_mask=None)
                    torch.cuda.synchronize(device)

                    # send predictions to cpu
                    center_pred = center_pred.squeeze(0).cpu().numpy()
                    semantic_pred = semantic_pred.squeeze(0).cpu().numpy()
                    panoptic_pred = panoptic_pred.squeeze(0).cpu().numpy()

                    # crop predictions
                    semantic_pred = semantic_pred[:raw_h, :raw_w]
                    panoptic_pred = panoptic_pred[:raw_h, :raw_w]

                    # save predictions
                    pil_image = save_panoptic_annotation(
                        panoptic_pred,
                        panoptic_out_dir,
                        'panoptic_pred_%d' % ii,
                        label_divisor=meta_dataset.label_divisor,
                        center_pred=center_pred,
                        colormap=meta_dataset.create_label_colormap(),
                        labelmap=meta_dataset.create_label_stringmap() if args.text_labels else None,
                        image=raw_image)
                    ii += 1

                    # write image to the video file
                    np_image = np.asarray(pil_image)
                    np_image = np_image[:, :, ::-1]  # flip channels, OpenCV uses BGR
                    out.write(np_image)

                    # update progress bar
                    pbar.update(1)
                else:
                    break
            pbar.close()

            # release everything once the job is finished
            cap.release()
            out.release()
            cv2.destroyAllWindows()
    except Exception:
        logger.exception("Exception during segment_video.py:")
        raise
    finally:
        logger.info("Segmenting video finished.")
        logger.info("Panoptic predictions saved to {}".format(panoptic_out_dir))
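
# NOTE (assumptions): MJPG in an .avi container is used because it is widely
# supported across OpenCV builds; writing .mp4 directly would need another
# FourCC such as cv2.VideoWriter_fourcc(*'mp4v'), whose availability depends
# on the local OpenCV/FFmpeg build. A typical invocation might look like the
# following (flag spellings are hypothetical; they correspond to the
# args.input, args.output_dir and args.text_labels attributes used above):
#
#     python segment_video.py --cfg <config.yaml> --input video.mp4 \
#         --output-dir output/ --text-labels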