Example #1
def main():
    args = parse_args()

    logger = logging.getLogger('demo')
    if not logger.isEnabledFor(logging.INFO):  # setup_logger has not been called yet
        setup_logger(output=args.output_dir, name='demo')

    logger.info(pprint.pformat(args))
    logger.info(config)

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED
    gpus = list(config.TEST.GPUS)
    if len(gpus) > 1:
        raise ValueError('Test only supports single core.')
    device = torch.device('cuda:{}'.format(gpus[0]))

    # build model
    model = build_segmentation_model_from_cfg(config)

    # Change ASPP image pooling
    # output_stride = 2 ** (5 - sum(config.MODEL.BACKBONE.DILATION))
    # train_crop_h, train_crop_w = config.TEST.CROP_SIZE
    # scale = 1. / output_stride
    # pool_h = int((float(train_crop_h) - 1.0) * scale + 1.0)
    # pool_w = int((float(train_crop_w) - 1.0) * scale + 1.0)

    # model.set_image_pooling((pool_h, pool_w))

    logger.info("Model:\n{}".format(model))
    model = model.to(device)

    try:
        # build data_loader
        data_loader = build_test_loader_from_cfg(config)
        meta_dataset = data_loader.dataset
        save_intermediate_outputs = True
    except Exception:
        logger.warning(
            "Cannot build data loader; falling back to default metadata. "
            "Intermediate outputs will not be visualized.")
        if 'cityscapes' in config.DATASET.DATASET:
            meta_dataset = CityscapesMeta()
        else:
            raise ValueError("Unsupported dataset: {}".format(
                config.DATASET.DATASET))
        save_intermediate_outputs = False

    # load model
    if config.TEST.MODEL_FILE:
        model_state_file = config.TEST.MODEL_FILE
    else:
        model_state_file = os.path.join(config.OUTPUT_DIR, 'final_state.pth')

    if os.path.isfile(model_state_file):
        model_weights = torch.load(model_state_file)
        if 'state_dict' in model_weights.keys():
            model_weights = model_weights['state_dict']
            logger.info('Evaluating an intermediate checkpoint.')
        model.load_state_dict(model_weights, strict=False)
        logger.info('Test model loaded from {}'.format(model_state_file))
    else:
        if not config.DEBUG.DEBUG:
            raise ValueError('Cannot find test model.')

    # load images
    input_list = []
    if os.path.exists(args.input_files):
        if os.path.isfile(args.input_files):
            # inference on a single file, extract extension
            ext = os.path.splitext(os.path.basename(args.input_files))[1]
            if ext in ['.png', '.jpg', '.jpeg']:
                # image file
                input_list.append(args.input_files)
            elif ext in ['.mpeg']:
                # video file
                # TODO: decode video and convert to image list
                raise NotImplementedError(
                    "Inference on video is not supported yet.")
            else:
                raise ValueError("Unsupported extension: {}.".format(ext))
        else:
            # inference on a directory
            for fname in glob.glob(
                    os.path.join(args.input_files, '*' + args.extension)):
                input_list.append(fname)
    else:
        raise ValueError('Input file or directory does not exist: {}'.format(
            args.input_files))

    if not input_list:
        raise ValueError('No input files found in: {}'.format(args.input_files))

    if isinstance(input_list[0], str):
        logger.info("Inference on images")
        logger.info(input_list)
    else:
        logger.info("Inference on video")

    # dir to save intermediate raw outputs
    raw_out_dir = os.path.join(args.output_dir, 'raw')
    PathManager.mkdirs(raw_out_dir)

    # dir to save semantic outputs
    semantic_out_dir = os.path.join(args.output_dir, 'semantic')
    PathManager.mkdirs(semantic_out_dir)

    # dir to save instance outputs
    instance_out_dir = os.path.join(args.output_dir, 'instance')
    PathManager.mkdirs(instance_out_dir)

    # dir to save panoptic outputs
    panoptic_out_dir = os.path.join(args.output_dir, 'panoptic')
    PathManager.mkdirs(panoptic_out_dir)

    # Test loop
    model.eval()

    # build image demo transform
    transforms = T.Compose(
        [T.ToTensor(),
         T.Normalize(config.DATASET.MEAN, config.DATASET.STD)])
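    # Note: T here appears to be the project's paired segmentation transforms
    # (they take an (image, label) pair, hence `transforms(input_image, None)`
    # below), not torchvision.transforms.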

    net_time = AverageMeter()
    post_time = AverageMeter()
    try:
        with torch.no_grad():
            for i, fname in enumerate(input_list):
                if isinstance(fname, str):
                    # load image
                    raw_image = read_image(fname, 'RGB')
                else:
                    raise NotImplementedError(
                        "Inference on video is not supported yet.")

                # pad image
                raw_shape = raw_image.shape[:2]
                raw_h = raw_shape[0]
                raw_w = raw_shape[1]
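                # Pad each side up to a multiple of 32 plus 1 (stride-32
                # alignment, as is common for DeepLab-style backbones); the
                # border is filled with the dataset mean so it normalizes to ~0.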
                new_h = (raw_h + 31) // 32 * 32 + 1
                new_w = (raw_w + 31) // 32 * 32 + 1
                input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
                input_image[:, :] = config.DATASET.MEAN
                input_image[:raw_h, :raw_w, :] = raw_image

                image, _ = transforms(input_image, None)
                image = image.unsqueeze(0).to(device)

                # network
                start_time = time.time()
                out_dict = model(image)
                torch.cuda.synchronize(device)
                net_time.update(time.time() - start_time)

                # post-processing
                start_time = time.time()
                semantic_pred = get_semantic_segmentation(out_dict['semantic'])

                panoptic_pred, center_pred = get_panoptic_segmentation(
                    semantic_pred,
                    out_dict['center'],
                    out_dict['offset'],
                    thing_list=meta_dataset.thing_list,
                    label_divisor=meta_dataset.label_divisor,
                    stuff_area=config.POST_PROCESSING.STUFF_AREA,
                    void_label=(meta_dataset.label_divisor *
                                meta_dataset.ignore_label),
                    threshold=config.POST_PROCESSING.CENTER_THRESHOLD,
                    nms_kernel=config.POST_PROCESSING.NMS_KERNEL,
                    top_k=config.POST_PROCESSING.TOP_K_INSTANCE,
                    foreground_mask=None)
                torch.cuda.synchronize(device)
                post_time.update(time.time() - start_time)

                logger.info(
                    '[{}/{}]\t'
                    'Network Time: {net_time.val:.3f}s ({net_time.avg:.3f}s)\t'
                    'Post-processing Time: {post_time.val:.3f}s ({post_time.avg:.3f}s)\t'
                    .format(i,
                            len(input_list),
                            net_time=net_time,
                            post_time=post_time))

                # save predictions
                semantic_pred = semantic_pred.squeeze(0).cpu().numpy()
                panoptic_pred = panoptic_pred.squeeze(0).cpu().numpy()

                # crop predictions
                semantic_pred = semantic_pred[:raw_h, :raw_w]
                panoptic_pred = panoptic_pred[:raw_h, :raw_w]

                if save_intermediate_outputs:
                    # Raw outputs
                    save_debug_images(
                        dataset=meta_dataset,
                        batch_images=image,
                        batch_targets={},
                        batch_outputs=out_dict,
                        out_dir=raw_out_dir,
                        iteration=i,
                        target_keys=[],
                        output_keys=['semantic', 'center', 'offset'],
                        is_train=False,
                    )

                save_annotation(semantic_pred,
                                semantic_out_dir,
                                'semantic_pred_%d' % i,
                                add_colormap=True,
                                colormap=meta_dataset.create_label_colormap(),
                                image=raw_image if args.merge_image else None)
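                # Panoptic ids encode semantic_id * label_divisor + instance_id,
                # so integer division recovers the semantic class map.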
                pan_to_sem = panoptic_pred // meta_dataset.label_divisor
                save_annotation(pan_to_sem,
                                semantic_out_dir,
                                'panoptic_to_semantic_pred_%d' % i,
                                add_colormap=True,
                                colormap=meta_dataset.create_label_colormap(),
                                image=raw_image if args.merge_image else None)
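                # The remainder is the per-pixel instance id; stuff regions
                # carry instance id 0 and are therefore zeroed in the map below.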
                ins_id = panoptic_pred % meta_dataset.label_divisor
                pan_to_ins = panoptic_pred.copy()
                pan_to_ins[ins_id == 0] = 0
                save_instance_annotation(
                    pan_to_ins,
                    instance_out_dir,
                    'panoptic_to_instance_pred_%d' % i,
                    image=raw_image if args.merge_image else None)
                save_panoptic_annotation(
                    panoptic_pred,
                    panoptic_out_dir,
                    'panoptic_pred_%d' % i,
                    label_divisor=meta_dataset.label_divisor,
                    colormap=meta_dataset.create_label_colormap(),
                    image=raw_image if args.merge_image else None)
    except Exception:
        logger.exception("Exception during demo:")
        raise
    finally:
        logger.info("Demo finished.")
        if save_intermediate_outputs:
            logger.info("Intermediate outputs saved to {}".format(raw_out_dir))
        logger.info(
            "Semantic predictions saved to {}".format(semantic_out_dir))
        logger.info(
            "Instance predictions saved to {}".format(instance_out_dir))
        logger.info(
            "Panoptic predictions saved to {}".format(panoptic_out_dir))
Example #2
    def main(self, frame, index, total):
        self.model.eval()

        # build image demo transform
        transforms = T.Compose([
            T.ToTensor(),
            T.Normalize(config.DATASET.MEAN, config.DATASET.STD)
        ])

        net_time = AverageMeter()
        post_time = AverageMeter()
        try:
            with torch.no_grad():
                raw_image = frame
                # pad image
                raw_shape = raw_image.shape[:2]
                raw_h = raw_shape[0]
                raw_w = raw_shape[1]
                new_h = (raw_h + 31) // 32 * 32 + 1
                new_w = (raw_w + 31) // 32 * 32 + 1
                input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
                input_image[:, :] = config.DATASET.MEAN
                input_image[:raw_h, :raw_w, :] = raw_image

                image, _ = transforms(input_image, None)
                image = image.unsqueeze(0).to(self.device)

                # network
                start_time = time.time()
                out_dict = self.model(image)
                torch.cuda.synchronize(self.device)
                net_time.update(time.time() - start_time)

                # post-processing
                start_time = time.time()
                semantic_pred = get_semantic_segmentation(out_dict['semantic'])

                panoptic_pred, center_pred = get_panoptic_segmentation(
                    semantic_pred,
                    out_dict['center'],
                    out_dict['offset'],
                    thing_list=self.meta_dataset.thing_list,
                    label_divisor=self.meta_dataset.label_divisor,
                    stuff_area=config.POST_PROCESSING.STUFF_AREA,
                    void_label=(self.meta_dataset.label_divisor *
                                self.meta_dataset.ignore_label),
                    threshold=config.POST_PROCESSING.CENTER_THRESHOLD,
                    nms_kernel=config.POST_PROCESSING.NMS_KERNEL,
                    top_k=config.POST_PROCESSING.TOP_K_INSTANCE,
                    foreground_mask=None)
                torch.cuda.synchronize(self.device)
                post_time.update(time.time() - start_time)

                self.logger.info(
                    '[{}/{}]\t'
                    'Network Time: {net_time.val:.3f}s ({net_time.avg:.3f}s)\t'
                    'Post-processing Time: {post_time.val:.3f}s ({post_time.avg:.3f}s)\t'
                    .format(index,
                            total,
                            net_time=net_time,
                            post_time=post_time))

                # save predictions
                #semantic_pred = semantic_pred.squeeze(0).cpu().numpy()
                panoptic_pred = panoptic_pred.squeeze(0).cpu().numpy()

                # crop predictions
                #semantic_pred = semantic_pred[:raw_h, :raw_w]
                panoptic_pred = panoptic_pred[:raw_h, :raw_w]

                frame = creat_panoptic_annotation(
                    panoptic_pred,
                    label_divisor=self.meta_dataset.label_divisor,
                    colormap=self.meta_dataset.create_label_colormap(),
                    image=raw_image)
        except Exception:
            self.logger.exception("Exception during demo:")
            raise
        finally:
            self.logger.info("Demo finished.")
        return frame
Example #3
def multi_scale_inference(config, model, raw_image, t_image, device):
    scales = config.TEST.SCALE_LIST
    flip = config.TEST.FLIP_TEST
    # output_stride = 2 ** (5 - sum(config.MODEL.BACKBONE.DILATION))
    # train_crop_h, train_crop_w = config.TEST.CROP_SIZE
    # scale = 1. / output_stride
    # pool_h = int((float(train_crop_h) - 1.0) * scale + 1.0)
    # pool_w = int((float(train_crop_w) - 1.0) * scale + 1.0)
    # transforms
    transforms = T.Compose(
        [T.ToTensor(),
         T.Normalize(config.DATASET.MEAN, config.DATASET.STD)])
    if flip:
        flip_range = 2
    else:
        flip_range = 1

    # h,w,_ = raw_image.shape
    _, _, h, w = t_image.shape
    org_h_pad = (h + 31) // 32 * 32
    org_w_pad = (w + 31) // 32 * 32

    sum_semantic_with_flip = 0
    sum_center_with_flip = 0
    sum_offset_with_flip = 0
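    # These accumulate the raw network outputs over every scale (and optional
    # horizontal flip); the sums are averaged after the loop.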

    for i in range(len(scales)):
        image = raw_image
        scale = scales[i]
        raw_h = int(h * scale)
        raw_w = int(w * scale)

        image = cv2.resize(image,
                           None,
                           fx=scale,
                           fy=scale,
                           interpolation=cv2.INTER_LINEAR).astype(np.int32)
        nh, nw, _ = image.shape

        # pad image
        new_h = (raw_h + 31) // 32 * 32
        new_w = (raw_w + 31) // 32 * 32
        input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
        input_image[:, :] = config.DATASET.MEAN
        # input_image[:raw_h, :raw_w, :] = image
        input_image[:nh, :nw, :] = image

        image, _ = transforms(input_image, None)
        image = image.unsqueeze(0).to(device)

        model = model.to(device)

        for flip in range(flip_range):
            if flip:
                image = flip_tensor(image, 3)
            out_dict = model(image)
            for key in out_dict.keys():  # return to raw_input shape
                out_dict[key] = out_dict[key][:, :, :raw_h, :raw_w]

            if raw_h != org_h_pad or raw_w != org_w_pad:
                out_dict = upsample_predictions(out_dict,
                                                (org_h_pad, org_w_pad), scale)

            # average softmax or logit?
            semantic_pred = out_dict['semantic']
            # semantic_pred = F.softmax(out_dict['semantic'],dim=1)

            center_pred = out_dict['center']
            offset_pred = out_dict['offset']
            if flip:
                semantic_pred = flip_tensor(semantic_pred, 3)
                center_pred = flip_tensor(center_pred, 3)
                offset_pred = flip_tensor(offset_pred, 3)
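                # Undoing the horizontal flip also negates the horizontal offset;
                # channel 1 is assumed to hold the x component here.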
                offset_pred[:, 1, :, :] *= (-1)

            sum_semantic_with_flip += semantic_pred
            sum_center_with_flip += center_pred
            sum_offset_with_flip += offset_pred

    semantic_mean = sum_semantic_with_flip / (flip_range * len(scales))
    center_mean = sum_center_with_flip / (flip_range * len(scales))
    offset_mean = sum_offset_with_flip / (flip_range * len(scales))

    out_dict['semantic'] = semantic_mean
    out_dict['center'] = center_mean
    out_dict['offset'] = offset_mean
    return out_dict
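
# Usage sketch (an assumption, not part of the original demos): inside the
# inference loop of Example #1, the single forward pass
#     out_dict = model(image)
# could in principle be replaced with multi-scale, flip-averaged inference:
#     out_dict = multi_scale_inference(config, model, raw_image, image, device)
# The padding/shape handling would still need to be reconciled with
# Example #1's 32*n + 1 padding before this works as a drop-in.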
Example #4
def main():
    args = parse_args()

    logger = logging.getLogger('segment_video.py')
    if not logger.isEnabledFor(logging.INFO):  # setup_logger has not been called yet
        setup_logger(output=args.output_dir, name='segment_video.py')

    logger.info(pprint.pformat(args))
    logger.info(config)

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED
    gpus = list(config.TEST.GPUS)
    if len(gpus) > 1:
        raise ValueError('Test only supports single core.')
    device = torch.device('cuda:{}'.format(gpus[0]))

    # build model
    model = build_segmentation_model_from_cfg(config)

    logger.info("Model:\n{}".format(model))
    model = model.to(device)
    meta_dataset = CityscapesMeta()

    # load model
    if config.TEST.MODEL_FILE:
        model_state_file = config.TEST.MODEL_FILE
    else:
        model_state_file = os.path.join(config.OUTPUT_DIR, 'final_state.pth')

    if os.path.isfile(model_state_file):
        model_weights = torch.load(model_state_file)
        if 'state_dict' in model_weights.keys():
            model_weights = model_weights['state_dict']
            logger.info('Evaluating an intermediate checkpoint.')
        model.load_state_dict(model_weights, strict=True)
        logger.info('Test model loaded from {}'.format(model_state_file))
    else:
        if not config.DEBUG.DEBUG:
            raise ValueError('Cannot find test model.')

    model.eval()

    # load images
    cap = None
    if os.path.exists(args.input):
        if os.path.isfile(args.input):
            # extract extension
            ext = os.path.splitext(os.path.basename(args.input))[1]
            if ext in ['.mpeg', '.mp4']:
                cap = cv2.VideoCapture(args.input)
            else:
                raise ValueError("Unsupported extension: {}.".format(ext))
        else:
            raise ValueError(
                "Input must be a file, not a directory: {}".format(args.input))
    else:
        raise ValueError('Input file does not exist: {}'.format(args.input))

    # dir to save panoptic outputs
    panoptic_out_dir = os.path.join(args.output_dir, 'panoptic')
    PathManager.mkdirs(panoptic_out_dir)

    # build image demo transform
    transforms = T.Compose(
        [T.ToTensor(),
         T.Normalize(config.DATASET.MEAN, config.DATASET.STD)])

    # Get video information
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    out = cv2.VideoWriter(os.path.join(args.output_dir, 'output.avi'), fourcc,
                          fps, (width, height))
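    # Note: cv2.VideoWriter silently produces a broken file if written frames
    # do not match (width, height); the annotated frames below are assumed to
    # keep the original video resolution.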

    try:
        with torch.no_grad():
            pbar = tqdm(total=length)
            ii = 0
            while cap.isOpened():
                ret, raw_image = cap.read()
                if ret:

                    # pad image
                    raw_shape = raw_image.shape[:2]
                    raw_h = raw_shape[0]
                    raw_w = raw_shape[1]
                    new_h = (raw_h + 31) // 32 * 32 + 1
                    new_w = (raw_w + 31) // 32 * 32 + 1
                    input_image = np.zeros((new_h, new_w, 3), dtype=np.uint8)
                    input_image[:, :] = config.DATASET.MEAN
                    input_image[:raw_h, :raw_w, :] = raw_image

                    image, _ = transforms(input_image, None)
                    image = image.unsqueeze(0).to(device)

                    # network
                    out_dict = model(image)
                    torch.cuda.synchronize(device)

                    # post-processing
                    semantic_pred = get_semantic_segmentation(
                        out_dict['semantic'])

                    panoptic_pred, center_pred = get_panoptic_segmentation(
                        semantic_pred,
                        out_dict['center'],
                        out_dict['offset'],
                        thing_list=meta_dataset.thing_list,
                        label_divisor=meta_dataset.label_divisor,
                        stuff_area=config.POST_PROCESSING.STUFF_AREA,
                        void_label=(meta_dataset.label_divisor *
                                    meta_dataset.ignore_label),
                        threshold=config.POST_PROCESSING.CENTER_THRESHOLD,
                        nms_kernel=config.POST_PROCESSING.NMS_KERNEL,
                        top_k=config.POST_PROCESSING.TOP_K_INSTANCE,
                        foreground_mask=None)
                    torch.cuda.synchronize(device)

                    # Send predictions to cpu
                    center_pred = center_pred.squeeze(0).cpu().numpy()
                    semantic_pred = semantic_pred.squeeze(0).cpu().numpy()
                    panoptic_pred = panoptic_pred.squeeze(0).cpu().numpy()

                    # Crop predictions
                    semantic_pred = semantic_pred[:raw_h, :raw_w]
                    panoptic_pred = panoptic_pred[:raw_h, :raw_w]

                    # Save predictions
                    pil_image = save_panoptic_annotation(
                        panoptic_pred,
                        panoptic_out_dir,
                        'panoptic_pred_%d' % ii,
                        label_divisor=meta_dataset.label_divisor,
                        center_pred=center_pred,
                        colormap=meta_dataset.create_label_colormap(),
                        labelmap=meta_dataset.create_label_stringmap()
                        if args.text_labels else None,
                        image=raw_image)

                    ii += 1

                    # Write image to video file
                    np_image = np.asarray(pil_image)
                    np_image = np_image[:, :, ::-1]  # flip channels, OpenCV uses BGR
                    out.write(np_image)

                    # Update progress bar
                    pbar.update(1)
                else:
                    break

        pbar.close()
        # Release everything if job is finished
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    except Exception:
        logger.exception("Exception during segment_video.py:")
        raise
    finally:
        logger.info("Segmenting video finished.")
        logger.info(
            "Panoptic predictions saved to {}".format(panoptic_out_dir))