Example 1
def do_inference(cfg, model, sample: DataSample, transforms=None,
                 given_detection: DataSample = None) -> DataSample:
    """
    Do inference on a specific video (sample)
    :param cfg: configuration file of the model
    :param model: a pytorch model
    :param sample: a testing video
    :param transforms: image-wise transform that prepares
           video frames for processing
    :param given_detection: the cached detections from other model,
           it means that the detection branch is disabled in the
           model forward pass
    :return: the detection results in the format of DataSample
    """
    logger = logging.getLogger(__name__)
    model.eval()
    gpu_device = torch.device('cuda')

    video_loader = build_video_loader(cfg, sample, transforms)

    sample_result = DataSample(sample.id, raw_info=None, metadata=sample.metadata)
    network_time = 0
    for (video_clip, frame_id, timestamps) in tqdm(video_loader):
        frame_id = frame_id.item()
        timestamps = torch.squeeze(timestamps, dim=0).tolist()
        video_clip = torch.squeeze(video_clip, dim=0)

        frame_detection = None
        # use the publicly provided detections (e.g. MOT17, HiEve);
        # they need to be ingested into DataSample format first,
        # see readme/DATA.md for details
        if given_detection:
            frame_detection = given_detection.get_entities_for_frame_num(frame_id)
            frame_detection = convert_given_detections_to_boxlist(frame_detection,
                                                                  sample.width,
                                                                  sample.height)
            frame_height, frame_width = video_clip.shape[-2:]
            frame_detection = frame_detection.resize((frame_width, frame_height))
            frame_detection = [frame_detection.to(gpu_device)]

        with torch.no_grad():
            video_clip = video_clip.to(gpu_device)
            torch.cuda.synchronize()
            network_start_time = time.time()
            output_boxlists = model(video_clip, given_detection=frame_detection)
            torch.cuda.synchronize()
            network_time += time.time() - network_start_time

        # Resize to original image size and to xywh mode
        output_boxlists = [o.resize([sample.width, sample.height]).convert('xywh')
                           for o in output_boxlists]
        output_boxlists = [o.to(torch.device("cpu")) for o in output_boxlists]
        output_entities = boxlists_to_entities(output_boxlists, frame_id, timestamps)
        for entity in output_entities:
            sample_result.add_entity(entity)

    logger.info('Sample_id {} / Speed {:.2f} fps'.format(sample.id, len(sample) / network_time))

    return sample_result
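
The forward pass above is bracketed by torch.cuda.synchronize() calls because CUDA kernels launch asynchronously, so timing without synchronization would mostly measure launch overhead rather than compute. Below is a minimal, self-contained sketch of that timing pattern; the timed_forward helper and the toy linear model are placeholders, not part of the tracking code.

import time

import torch


def timed_forward(model, batch):
    # Synchronize before and after so pending asynchronous CUDA work
    # falls inside the measured interval (no-op on CPU-only machines).
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    with torch.no_grad():
        output = model(batch)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return output, time.time() - start


toy_model = torch.nn.Linear(8, 2)
_, elapsed = timed_forward(toy_model, torch.randn(4, 8))
print("forward pass took {:.2f} ms".format(elapsed * 1000))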
Example 2
def sample_from_mot_csv(csv_path, fps, sample=None, mot17=True, has_gt=False):
    if sample is None:
        id_ = Path(csv_path).stem
        sample = DataSample(id_)
    else:
        sample = sample.get_copy_without_entities()
    with open(csv_path, newline='') as f:
        reader = csv.reader(f, delimiter=',')

        def coord(x):
            return round(float(x))

        for row in reader:
            frame_num = int(row[0])
            obj_id = row[1]
            x = coord(row[2])
            y = coord(row[3])
            w = coord(row[4])
            h = coord(row[5])
            conf = float(row[6])
            # If not MOT17, the last 3 fields are 3D coords, which are usually -1
            # (see p. 9 of https://arxiv.org/pdf/1504.01942.pdf)
            if has_gt and mot17:
                label = int(row[7])
                visibility = float(row[8])
            else:
                label = 1
                visibility = 1

            label_text = MOT_LABEL_MAP[label]

            # NOTE: all classes other than Pedestrian have confidence 0, so they are
            # ingested but ignored at evaluation time,
            # i.e. (label != 1 and conf) is never true
            assert not (label != 1 and conf)
            has_person_label = label_text == "Pedestrian"

            time_ms = int((frame_num - 1) / fps * 1000)
            entity = AnnoEntity(time=time_ms, id=obj_id)
            entity.bbox = [x, y, w, h]
            blob = {
                "frame_csv": frame_num,
                "frame_idx": frame_num - 1,
                "visibility": visibility
            }
            entity.labels = {}
            # entity.labels["person"] = 1
            if has_person_label:
                entity.labels["person"] = 1
            else:
                entity.labels[str(label)] = 1
            entity.labels["vis"] = visibility

            entity.confidence = conf
            entity.blob = blob

            sample.add_entity(entity)
    return sample
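
For reference, each row of a MOT17 ground-truth CSV has the layout frame, id, x, y, w, h, conf, class, visibility (for MOT15 the last three fields are 3D coordinates instead). Below is a small self-contained sketch of the same parsing logic on an in-memory CSV; the sample rows are made up for illustration.

import csv
import io

MOT_ROWS = "1,1,912,484,97,109,1,1,0.8\n2,1,910,480,97,109,1,1,0.9\n"

for row in csv.reader(io.StringIO(MOT_ROWS)):
    frame_num, obj_id = int(row[0]), row[1]
    x, y, w, h = (round(float(v)) for v in row[2:6])  # bbox in xywh, pixel units
    conf = float(row[6])
    label, visibility = int(row[7]), float(row[8])    # MOT17 ground truth only
    print(frame_num, obj_id, [x, y, w, h], conf, label, visibility)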
Example 3
def eval_det_ap(gt: list, pred: dict, class_table=None, data_filter_fn=None, iou_threshold=[0.5]):
    """
    Evaluate the detection performance (COCO-style ap) on PoseTrack dataset
    :param gt: ground truth annotations for all videos
    :type gt: dict(vid_id: DataSample)
    :param pred: predictions for all videos
    :type pred: dict(vid_id: DataSample)
    :param data_filter_fn: a callable function that filters out detections that are not considered during evaluation
    :param class_table: class table specify the class order
    :param iou_threshold:
    :return: Average Precision (AP) over different thresholds
    """
    if class_table is None:
        class_table = ["person"]
    num_classes = len(class_table)

    all_scores = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]
    all_pr_ious = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]
    all_gt_ious = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]

    for (vid_id, vid_gt) in tqdm(gt):
        vid_pred = pred[vid_id]

        eval_frame_idxs = vid_gt.get_non_empty_frames()

        # Loop over all classes
        for class_id in range(0, num_classes):
            gt_class_entities = vid_gt.entities
            # gt_class_entities = vid_gt.get_entities_with_label(class_table[class_id])
            pred_class_entities = vid_pred.get_entities_with_label(class_table[class_id])

            # Wrap entities to a DataSample
            vid_class_gt = DataSample(vid_id, metadata=vid_gt.metadata)
            vid_class_pred = DataSample(vid_id, metadata=vid_pred.metadata)
            for _entity in gt_class_entities:
                vid_class_gt.add_entity(_entity)
            for _entity in pred_class_entities:
                vid_class_pred.add_entity(_entity)

            # Get AP for this class and video
            vid_class_scores, vid_class_pr_ious, vid_class_gt_ious = \
                get_ap(vid_class_gt, vid_class_pred, data_filter_fn, eval_frame_idxs, iou_threshold)

            for iou_id in range(len(iou_threshold)):
                all_scores[class_id][iou_id] += vid_class_scores[iou_id]
                all_pr_ious[class_id][iou_id] += vid_class_pr_ious[iou_id]
                all_gt_ious[class_id][iou_id] += vid_class_gt_ious[iou_id]

    class_ap_matrix = np.zeros((num_classes, len(iou_threshold)))
    for class_id in range(num_classes):
        class_ap_matrix[class_id, :] = compute_AP(all_scores[class_id],
                                                  all_pr_ious[class_id],
                                                  all_gt_ious[class_id])

    return class_ap_matrix
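
compute_AP itself is not shown in these examples; the sketch below illustrates the COCO-style computation it stands for at a single IoU threshold: sort detections by confidence, mark each as TP or FP via its matched IoU, then integrate precision over recall. The helper name and the interpolation details are assumptions, not the repository's exact implementation.

import numpy as np


def average_precision(scores, pred_ious, num_gt, iou_thresh=0.5):
    # scores: confidence of each prediction
    # pred_ious: IoU of each prediction with its matched gt (0 if unmatched)
    # num_gt: number of ground-truth boxes for this class
    order = np.argsort(scores)[::-1]
    tp = (np.asarray(pred_ious, dtype=float)[order] >= iou_thresh).astype(float)
    fp = 1.0 - tp
    tp_cum, fp_cum = np.cumsum(tp), np.cumsum(fp)
    recall = tp_cum / max(num_gt, 1)
    precision = tp_cum / np.maximum(tp_cum + fp_cum, 1e-12)
    # make precision monotonically non-increasing, then integrate over recall
    precision = np.maximum.accumulate(precision[::-1])[::-1]
    return float(np.trapz(precision, recall))


print(average_precision([0.9, 0.8, 0.7], [0.8, 0.4, 0.6], num_gt=3))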
Example 4
    def _postprocess_tracks(self, tracks: DataSample):
        """
        post_process the tracks to filter out short and non-confident tracks
        :param tracks: un-filtered tracks
        :return: filtered tracks that would be used for evaluation
        """
        track_ids = set()
        for _entity in tracks.entities:
            if _entity.id not in track_ids and _entity.id >= 0:
                track_ids.add(_entity.id)

        filter_tracks = tracks.get_copy_without_entities()
        for _id in track_ids:
            _id_entities = tracks.get_entities_with_id(_id)
            _track_conf = np.mean([_e.confidence for _e in _id_entities])
            if len(_id_entities) >= self._track_len \
                    and _track_conf >= self._track_conf:
                for _entity in _id_entities:
                    filter_tracks.add_entity(_entity)
        return filter_tracks
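
The filtering rule above keeps a track only if it contains at least self._track_len entities and their mean confidence is at least self._track_conf. A toy illustration of the same rule on plain Python data (the track dictionary and thresholds below are made up):

import numpy as np

track_confidences = {
    0: [0.9, 0.8, 0.85, 0.9],   # long and confident -> kept
    1: [0.9],                   # too short -> dropped
    2: [0.2, 0.3, 0.1, 0.25],   # long enough but low confidence -> dropped
}
min_len, min_conf = 2, 0.5
kept = {tid: confs for tid, confs in track_confidences.items()
        if len(confs) >= min_len and np.mean(confs) >= min_conf}
print(sorted(kept))  # [0]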
Example 5
def get_ap(vid_class_gt: DataSample, vid_class_pred: DataSample, filter_fn, eval_frame_idxs, iou_thresh=[0.5]):
    """
    :param vid_class_gt: the ground truths for a specific class, in DataSample format
    :param vid_class_pred: the predictions for a specific class, in DataSample format
    :param filter_fn: a callable function to filter out detections
    :param eval_frame_idxs: the frame indices where evaluation happens
    :param iou_thresh: the list of IoU thresholds that determine whether a detection is a true positive
    :returns:
           vid_scores: the confidence of every predicted entity (a Python list)
           vid_pr_ious: the IoU between each predicted entity and its matching gt entity (a Python list)
           vid_gt_ious: the IoU between each gt entity and its matching predicted entity (a Python list)
    """
    if not isinstance(iou_thresh, list):
        iou_thresh = [iou_thresh]
    vid_scores = [[] for _ in iou_thresh]
    vid_pr_ious = [[] for _ in iou_thresh]
    vid_gt_ious = [[] for _ in iou_thresh]
    for frame_idx in eval_frame_idxs:

        gt_entities = vid_class_gt.get_entities_for_frame_num(frame_idx)
        pred_entities = vid_class_pred.get_entities_for_frame_num(frame_idx)

        # Remove detections for evaluation that are within ignore regions
        if filter_fn is not None:
            # Filter out ignored gt entities
            gt_entities, ignore_gt_entities = filter_fn(gt_entities, meta_data=vid_class_gt.metadata)
            # Filter out predicted entities that overlap with ignored gt entities
            pred_entities, ignore_pred_entities = filter_fn(pred_entities, ignore_gt_entities)

        # sort the entities by confidence score
        pred_entities = sorted(pred_entities, key=lambda x: x.confidence, reverse=True)
        iou_matrix = bbs_iou(pred_entities, gt_entities)
        scores = [entity.confidence for entity in pred_entities]
        for i, _iou in enumerate(iou_thresh):
            # pred_ious, gt_ious = target_matching(pred_entities, gt_entities)
            pred_ious, gt_ious = greedy_matching(copy.deepcopy(iou_matrix), _iou)
            vid_scores[i] += scores
            vid_pr_ious[i] += pred_ious
            vid_gt_ious[i] += gt_ious

    return vid_scores, vid_pr_ious, vid_gt_ious
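
greedy_matching is not shown in these examples; the sketch below captures the greedy assignment it performs, assuming predictions arrive sorted by confidence and each one claims its best still-unmatched ground truth when the IoU clears the threshold. The exact return convention is an assumption.

import numpy as np


def greedy_matching(iou_matrix, iou_thresh):
    # iou_matrix: (num_pred, num_gt); returns matched IoU per prediction
    # and per ground truth, 0.0 where unmatched.
    num_pred, num_gt = iou_matrix.shape
    pred_ious, gt_ious = np.zeros(num_pred), np.zeros(num_gt)
    taken = np.zeros(num_gt, dtype=bool)
    for p in range(num_pred):  # predictions assumed sorted by confidence
        if num_gt == 0:
            break
        candidates = np.where(~taken, iou_matrix[p], -1.0)
        g = int(np.argmax(candidates))
        if candidates[g] >= iou_thresh:
            taken[g] = True
            pred_ious[p] = gt_ious[g] = iou_matrix[p, g]
    return pred_ious.tolist(), gt_ious.tolist()


ious = np.array([[0.7, 0.1], [0.6, 0.4]])
print(greedy_matching(ious, 0.5))  # gt 0 goes to the first prediction only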
Example 6
    def __init__(self, video: DataSample, clip_len=1, transforms=None):
        """
        Construct a data loader for inference
        :param video: a video stream in DataSample format
        :param clip_len: the length of video clips
        :param transforms: transform function for video pre-processing
        """
        self.video = video
        self.video_reader = video.get_data_reader()
        self.clip_len = clip_len
        self.transforms = transforms
        self.clip_idxs = list(range(0, len(self.video), self.clip_len))
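
The clip_idxs list simply chunks the video into fixed-length windows of clip_len frames, starting at frame 0. For example:

num_frames, clip_len = 10, 4
clip_idxs = list(range(0, num_frames, clip_len))
print(clip_idxs)  # [0, 4, 8]
print([list(range(s, min(s + clip_len, num_frames))) for s in clip_idxs])
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]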
Example 7
    def _inference_on_video(self, sample):
        cache_path = os.path.join(self._output_dir, '{}.json'.format(sample.id))
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)

        if os.path.exists(cache_path):
            sample_result = DataSample.load(cache_path)
        else:
            given_detection = None
            if self._pub_detection:
                given_detection = self._pub_detection[sample.id]
            sample_result = do_inference(self._cfg, self._model, sample,
                                         transforms=self._transform,
                                         given_detection=given_detection
                                         )
            sample_result.dump(cache_path)
        return sample_result
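
The method follows a simple cache-or-compute pattern keyed on the sample id: reuse the per-video JSON if it exists, otherwise run inference and persist the result. The same idea in isolation, with a generic helper; the cached_json name, the path, and the compute callable below are placeholders.

import json
import os


def cached_json(path, compute):
    # Load a previously saved JSON result if present; otherwise compute it,
    # write it to disk, and return it.
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    result = compute()
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w") as f:
        json.dump(result, f)
    return result


print(cached_json("cache_demo/sample.json", lambda: {"boxes": []}))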
Example 8
def main(args, description="Initial ingestion", det_options=None, mot17=True):
    if mot17:
        if det_options is not None and not all(x in DET_OPTIONS
                                               for x in det_options):
            raise ValueError("Det options were {} but must be only: {}".format(
                det_options, DET_OPTIONS))
        if det_options is None:
            det_options = DET_OPTIONS
    else:
        print("Ingesting MOT15, ignoring det options {}".format(det_options))
        det_options = [""]

    dataset_path = args.dataset_path
    out_filename = args.anno_name

    out_dataset = GluonCVMotionDataset(out_filename,
                                       dataset_path,
                                       load_anno=False)
    metadata = {
        FieldNames.DESCRIPTION: description,
        FieldNames.DATE_MODIFIED: str(datetime.datetime.now()),
    }
    out_dataset.metadata = metadata

    splits = {
        "train": os.path.join(out_dataset.data_root_path, "train"),
        "test": os.path.join(out_dataset.data_root_path,
                             "test"),  # No gt for MOT test
    }

    for det_option in det_options:
        for split_name, split_path in splits.items():
            subdirs = glob.glob(os.path.join(split_path, "*" + det_option))
            for i, subdir in enumerate(subdirs):
                vid_id = os.path.basename(subdir)
                vid_path = os.path.join(split_path, subdir)

                sample = DataSample(vid_id)

                if mot17:
                    info_path = os.path.join(vid_path, "seqinfo.ini")
                    config = configparser.ConfigParser()
                    config.read(info_path)
                    seq_conf = config["Sequence"]
                    fps = float(seq_conf['frameRate'])
                    num_frames = int(seq_conf['seqLength'])
                    width = int(seq_conf['imWidth'])
                    height = int(seq_conf['imHeight'])
                else:
                    # Assume 30 fps
                    fps = 30
                    im_paths = glob.glob(
                        os.path.join(vid_path, "img1", "*.jpg"))
                    num_frames = len(im_paths)
                    im_example = Image.open(im_paths[0])
                    width = im_example.width
                    height = im_example.height

                rel_base_dir = vid_path.replace(out_dataset.data_root_path,
                                                "").lstrip(os.path.sep)
                rel_base_dir = os.path.join(rel_base_dir, "img1")
                metadata = {
                    FieldNames.DATA_PATH: rel_base_dir,
                    FieldNames.FPS: fps,
                    FieldNames.NUM_FRAMES: num_frames,
                    FieldNames.RESOLUTION: {
                        "width": width,
                        "height": height
                    },
                }
                sample.metadata = metadata

                gt_path = os.path.join(vid_path, "gt/gt.txt")
                det_path = os.path.join(vid_path, "det/det.txt")
                has_gt = os.path.exists(gt_path)
                anno_path = gt_path if has_gt else det_path

                sample = sample_from_mot_csv(anno_path, fps, sample, mot17,
                                             has_gt)

                out_dataset.add_sample(sample)

                print("Done {} sample {}/{}, {}".format(
                    split_name, i + 1, len(subdirs), vid_id))

    out_dataset.dump()

    return out_dataset
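
For reference, the seqinfo.ini file read in the MOT17 branch looks like the snippet below; parsing it from a string with configparser mirrors what main does with the file on disk (the sequence values here are illustrative).

import configparser

SEQINFO = """\
[Sequence]
name=MOT17-02-FRCNN
imDir=img1
frameRate=30
seqLength=600
imWidth=1920
imHeight=1080
imExt=.jpg
"""

config = configparser.ConfigParser()
config.read_string(SEQINFO)
seq_conf = config["Sequence"]
print(float(seq_conf["frameRate"]), int(seq_conf["seqLength"]),
      int(seq_conf["imWidth"]), int(seq_conf["imHeight"]))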