def coco_evaluation(gts,
                    detections,
                    height,
                    width,
                    labelmap=("car", "pedestrian")):
    """Simple helper function wrapping around COCO's Python API
    
    Args:
        gts: iterable of numpy boxes for the ground truth
        detections: iterable of numpy boxes for the detections
        height (int): frame height 
        width (int): frame width
        labelmap (list): iterable of class labels
    """
    categories = [{
        "id": id + 1,
        "name": class_name,
        "supercategory": "none"
    } for id, class_name in enumerate(labelmap)]

    dataset, results = _to_coco_format(gts,
                                       detections,
                                       categories,
                                       height=height,
                                       width=width)

    coco_gt = COCO()
    coco_gt.dataset = dataset
    coco_gt.createIndex()
    coco_pred = coco_gt.loadRes(results) if len(results) else COCO()

    coco_eval = COCOeval(coco_gt, coco_pred, 'bbox')
    coco_eval.params.imgIds = np.arange(1, len(gts) + 1, dtype=int)
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.stats = summarize(coco_eval)

    stats = {
        "mean_ap": coco_eval.stats[0],
        "mean_ap50": coco_eval.stats[1],
        "mean_ap75": coco_eval.stats[2],
        "mean_ap_small": coco_eval.stats[3],
        "mean_ap_medium": coco_eval.stats[4],
        "mean_ap_big": coco_eval.stats[5],
        "mean_ar": coco_eval.stats[8],
        "mean_ar_small": coco_eval.stats[9],
        "mean_ar_medium": coco_eval.stats[10],
        "mean_ar_big": coco_eval.stats[11]
    }
    return stats
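A hypothetical usage sketch (not part of the source): it assumes the per-image box layout matches Example #2 below, i.e. ground-truth rows are [x1, y1, x2, y2, class_id] and detection rows are [x1, y1, x2, y2, score, class_id], and that _to_coco_format and summarize are importable next to the function.

import numpy as np

gts = [np.array([[10., 20., 60., 90., 0.]], dtype=np.float32)]        # one image, one "car"
dets = [np.array([[12., 22., 58., 88., 0.9, 0.]], dtype=np.float32)]  # one matching detection
stats = coco_evaluation(gts, dets, height=240, width=304, labelmap=("car", "pedestrian"))
print(stats["mean_ap50"])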
Example #2
def coco_eval(gts,
              proposals,
              labelmap,
              height,
              width,
              tmp_path,
              epoch,
              dump=False):
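    """Run COCO bbox evaluation on in-memory ground truth and proposals.

    Boxes are given per image: ground-truth rows are [x1, y1, x2, y2, class_id]
    and proposal rows are [x1, y1, x2, y2, score, class_id]. When ``dump`` is
    True, the COCO-format ground truth and results are also written as JSON
    under ``tmp_path``.
    """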
    categories = [{
        "id": id + 1,
        "name": class_name,
        "supercategory": "none"
    } for id, class_name in enumerate(labelmap)]

    annotations = []
    results = []
    image_ids = []
    images = []
    box_type = np.float32

    # to dictionary
    for image_id, (gt, pred) in enumerate(zip(gts, proposals)):
        im_id = image_id + 1

        images.append({
            "date_captured": "2019",
            "file_name": "n.a",
            "id": im_id,
            "license": 1,
            "url": "",
            "height": height,
            "width": width
        })

        for i in range(len(gt)):
            bbox = gt[i]
            segmentation = []
            x1, y1, x2, y2 = bbox[:4].astype(box_type).tolist()
            w, h = (x2 - x1), (y2 - y1)
            area = w * h
            category_id = bbox[4]
            annotation = {
                "area": float(area),
                "iscrowd": False,
                "image_id": im_id,
                "bbox": [x1, y1, w, h],
                "category_id": int(category_id) + 1,
                "id": len(annotations) + 1
            }
            annotations.append(annotation)

        for i in range(len(pred)):
            bbox = pred[i, :4]

            x1, y1, x2, y2 = bbox[:4].astype(box_type).tolist()
            w, h = (x2 - x1), (y2 - y1)

            score = pred[i, 4]
            category_id = pred[i, 5]
            image_result = {
                'image_id': im_id,
                'category_id': int(category_id) + 1,
                'score': float(score),
                'bbox': [x1, y1, w, h],
            }
            results.append(image_result)

        image_ids.append(im_id)

    json_data = {
        "info": {},
        "licenses": [],
        "type": 'instances',
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

    if len(results) == 0:
        results = [{
            'image_id': 1,
            'category_id': 1,
            'score': 0,
            'bbox': [0, 0, 0, 0]
        }]
    # Writing the file is time-consuming
    if dump:
        gt_filename = os.path.join(tmp_path, 'gt.json')
        result_filename = os.path.join(tmp_path, 'res.json')
        json.dump(json_data, open(gt_filename, 'w'), sort_keys=True, indent=4)
        json.dump(results, open(result_filename, 'w'), indent=4)
        coco_true = COCO(gt_filename)
        coco_pred = coco_true.loadRes(result_filename)
    else:
        coco_true = COCO()
        coco_true.dataset = json_data
        coco_true.createIndex()
        coco_pred = coco_true.loadRes(results)

    coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.stats = summarize(coco_eval)

    stats = {
        "mean_ap": coco_eval.stats[0],
        "mean_ap50": coco_eval.stats[1],
        "mean_ap75": coco_eval.stats[2],
        "mean_ap_small": coco_eval.stats[3],
        "mean_ap_medium": coco_eval.stats[4],
        "mean_ap_big": coco_eval.stats[5],
        "mean_ar": coco_eval.stats[8],
        "mean_ar_small": coco_eval.stats[9],
        "mean_ar_medium": coco_eval.stats[10],
        "mean_ar_big": coco_eval.stats[11]
    }
    return stats
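A hypothetical call (not from the source); the tmp_path and epoch values are placeholders, and dump=False keeps everything in memory.

import numpy as np

gts = [np.array([[10., 20., 60., 90., 0.]], dtype=np.float32)]
proposals = [np.array([[12., 22., 58., 88., 0.9, 0.]], dtype=np.float32)]
stats = coco_eval(gts, proposals, labelmap=("car", "pedestrian"),
                  height=240, width=304, tmp_path="/tmp", epoch=0, dump=False)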
Example #3
def evaluate_coco(coco_gt, coco_dt, parameters, catIds=None):
    LOG = getLogger('processor.EvaluateSegmentation')
    LOG.info("comparing segmentations")
    stats = dict(parameters)
    coco_eval = COCOeval(coco_gt, coco_dt, 'segm')  # bbox
    if catIds:
        coco_eval.params.catIds = catIds
    #coco_eval.params.iouThrs = [.5:.05:.95]
    #coco_eval.params.iouThrs = np.linspace(.3, .95, 14)
    coco_eval.params.maxDets = [
        None
    ]  # unlimited nr of detections (requires pycocotools#559)
    #coco_eval.params.areaRng = [(0, np.inf)] # unlimited region size
    #coco_eval.params.areaRngLbl = ['all'] # unlimited region size
    # Note: The IoU threshold criterion is inadequate for flat segmentation,
    #       because over-/undersegmentation can quickly become false negative/positive.
    #       The pycocotools implementation is especially inadequate, because
    #       it only counts 1:1 matches (and not even the largest or best-scoring, #564).
    #       On the other hand, purely pixel-wise measures do not distinguish instances,
    #       i.e. neighbours can quickly become merged or instances torn apart.
    #       Our approach therefore does not build on pycocotools for matching
    #       and aggregation, only for fast IoU calculation. All non-zero pairs
    #       are considered matches if their intersection over union > 0.5 _or_
    #       their intersection over either side > 0.5. Matches can thus be n:m.
    #       Non-matches are counted as well (false positives and false negatives).
    #       Aggregation uses microaveraging over images. Besides counting segments,
    #       the pixel areas are counted and averaged (as ratios).
    # FIXME: We must differentiate between allowable and non-allowable over/under-segmentation (splits/merges).
    #        (A region's split is allowable if it flows in the textLineOrder of the respective GT,
    #         i.e. lines are likely to be either on one side or the other, but not both.
    #         For top-to-bottom/bottom-to-top regions, vertical splits are allowable.
    #         For left-to-right/right-to-left regions, horizontal splits are allowable.
    #         To be sure, we could also validate that explicitly – evaluating both levels at the same time.
    #         Analogously, a number of regions' merge is allowable if it flows in the textLineOrder
    #         of them all, and the GT global reading order has no other regions in between.
    #         For top-to-bottom/bottom-to-top regions, vertical merges are allowable.
    #         For left-to-right/right-to-left regions, horizontal merges are allowable.
    #         Again, we could also validate that the overall textline flow is equivalent.)
    #        This difference can in turn be used to weigh a match pair's score accordingly
    #        when aggregating. For precision-like scores, we would rule out non-allowable merges
    #        (by counting them as FP), and for recall-like scores, we would rule out non-allowable splits
    #        (by counting them as FN).
    #        We can also weigh these non-allowable cases by their share of height
    #        (in vertical textLineOrder and horizontal writing) or width
    #        (in horizontal textLineOrder and vertical writing) which is in disagreement,
    #        or the share of its textlines that have been split or lost.
    #        Furthermore, we can weigh matches by the share of non-text regions or fg pixels involved.
    coco_eval.evaluate()

    # get by-page alignment (ignoring inadequate 1:1 matching by pycocotools)
    def get(arg):
        return lambda x: x[arg]

    numImgs = len(coco_eval.params.imgIds)
    numAreas = len(coco_eval.params.areaRng)
    for imgind, imgId in enumerate(coco_eval.params.imgIds):
        img = coco_gt.imgs[imgId]
        pageId = img['file_name']
        for catind, catId in enumerate(coco_eval.params.catIds):
            cat = coco_gt.cats[catId]
            catName = cat['name']
            if not catId:
                continue
            # bypassing COCOeval.evaluateImg, hook onto its results
            # (again, we stay at areaRng[0]=all and maxDets[0]=all)
            start = catind * numImgs * numAreas
            evalimg = coco_eval.evalImgs[start + imgind]
            if evalimg is None:
                continue  # no DT and GT here
            # record as dict by pageId / by category
            imgstats = stats.setdefault('by-image', dict())
            pagestats = imgstats.setdefault(pageId, dict())
            # get matches and ious and scores
            ious = coco_eval.ious[imgId, catId]
            if len(ious):
                overlaps_dt, overlaps_gt = ious.nonzero()
            else:
                overlaps_dt = overlaps_gt = []
            # reconstruct score sorting in computeIoU
            gt = coco_eval._gts[imgId, catId]
            dt = coco_eval._dts[imgId, catId]
            dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
            dt = [dt[i] for i in dtind]
            matches = list()
            gtmatches = dict()
            dtmatches = dict()
            for dtind, gtind in zip(overlaps_dt, overlaps_gt):
                d = dt[dtind]
                g = gt[gtind]
                iou = ious[dtind, gtind]
                union = maskArea(
                    mergeMasks([g['segmentation'], d['segmentation']]))
                intersection = int(iou * union)
                # cannot use g or d['area'] here, because mask might be fractional (only-fg) instead of outline
                areag = int(maskArea(g['segmentation']))
                aread = int(maskArea(d['segmentation']))
                iogt = intersection / areag
                iodt = intersection / aread
                if iou < 0.5 and iogt < 0.5 and iodt < 0.5:
                    continue
                gtmatches.setdefault(gtind, list()).append(dtind)
                dtmatches.setdefault(dtind, list()).append(gtind)
                matches.append(
                    (g['id'], d['id'], iogt, iodt, iou, intersection))
                pagestats.setdefault('true_positives', dict()).setdefault(catName, list()).append({
                    'GT.ID': g['segment_id'],
                    'DT.ID': d['segment_id'],
                    'GT.area': areag,
                    'DT.area': aread,
                    'I.area': intersection,
                    'IoGT': iogt,
                    'IoDT': iodt,
                    'IoU': iou
                })
            dtmisses = []
            for dtind, d in enumerate(dt):
                if dtind in dtmatches:
                    continue
                dtmisses.append((d['id'], maskArea(d['segmentation'])))
                pagestats.setdefault('false_positives', dict()).setdefault(catName, list()).append({
                    'DT.ID': d['segment_id'],
                    'area': int(d['area'])
                })
            gtmisses = []
            for gtind, g in enumerate(gt):
                if gtind in gtmatches:
                    continue
                gtmisses.append((g['id'], maskArea(g['segmentation'])))
                pagestats.setdefault('false_negatives', dict()).setdefault(catName, list()).append({
                    'GT.ID': g['segment_id'],
                    'area': int(g['area'])
                })
            # measure under/oversegmentation for this image and category
            # (follows Zhang et al 2021: Rethinking Semantic Segmentation Evaluation [arXiv:2101.08418])
            over_gt = set(gtind for gtind in gtmatches
                          if len(gtmatches[gtind]) > 1)
            over_dt = set(
                chain.from_iterable(gtmatches[gtind] for gtind in gtmatches
                                    if len(gtmatches[gtind]) > 1))
            under_dt = set(dtind for dtind in dtmatches
                           if len(dtmatches[dtind]) > 1)
            under_gt = set(
                chain.from_iterable(dtmatches[dtind] for dtind in dtmatches
                                    if len(dtmatches[dtind]) > 1))
            over_degree = sum(len(gtmatches[gtind]) - 1 for gtind in gtmatches)
            under_degree = sum(
                len(dtmatches[dtind]) - 1 for dtind in dtmatches)
            if len(dt) and len(gt):
                oversegmentation = len(over_gt) * len(over_dt) / len(gt) / len(dt)
                undersegmentation = len(under_gt) * len(under_dt) / len(gt) / len(dt)
                # Zhang's idea of attenuating the under/oversegmentation ratio with a "penalty"
                # to account for the degree of further sub-segmentation is misguided IMHO,
                # because its degree term depends on the total number of segments:
                # oversegmentation = np.tanh(oversegmentation * over_degree)
                # undersegmentation = np.tanh(undersegmentation * under_degree)
                pagestats.setdefault('oversegmentation', dict())[catName] = oversegmentation
                pagestats.setdefault('undersegmentation', dict())[catName] = undersegmentation
                pagestats.setdefault('precision', dict())[catName] = (len(dt) - len(dtmisses)) / len(dt)
                pagestats.setdefault('recall', dict())[catName] = (len(gt) - len(gtmisses)) / len(gt)
            tparea = sum(map(get(5), matches))  # sum(inter)
            fparea = sum(map(get(1), dtmisses))  # sum(area)
            fnarea = sum(map(get(1), gtmisses))  # sum(area)
            if tparea or (fparea and fnarea):
                pagestats.setdefault('pixel_precision', dict())[catName] = tparea / (tparea + fparea)
                pagestats.setdefault('pixel_recall', dict())[catName] = tparea / (tparea + fnarea)
                pagestats.setdefault('pixel_iou', dict())[catName] = tparea / (tparea + fparea + fnarea)
            # aggregate per-img/per-cat IoUs for microaveraging
            evalimg['matches'] = matches  # TP
            evalimg['dtMisses'] = dtmisses  # FP
            evalimg['gtMisses'] = gtmisses  # FN
            evalimg['dtIdsOver'] = [dt[dtind]['id'] for dtind in over_dt]
            evalimg['gtIdsOver'] = [gt[gtind]['id'] for gtind in over_gt]
            evalimg['dtIdsUnder'] = [dt[dtind]['id'] for dtind in under_dt]
            evalimg['gtIdsUnder'] = [gt[gtind]['id'] for gtind in under_gt]

    catstats = stats.setdefault('by-category', dict())
    # accumulate our over-/undersegmentation and IoU ratios
    numImgs = len(coco_eval.params.imgIds)
    numAreas = len(coco_eval.params.areaRng)
    for catind, catId in enumerate(coco_eval.params.catIds):
        cat = coco_gt.cats[catId]
        catstats.setdefault(cat['name'], dict())
        start = catind * numImgs * numAreas
        # again, we stay at areaRng[0]=all and maxDets[0]=all
        evalimgs = [
            coco_eval.evalImgs[start + imgind] for imgind in range(numImgs)
        ]
        evalimgs = [img for img in evalimgs if img is not None]
        assert all(img['category_id'] == catId for img in evalimgs)
        assert all(img['maxDet'] is None for img in evalimgs)
        assert all(img['aRng'] == coco_eval.params.areaRng[0]
                   for img in evalimgs)
        if not len(evalimgs):
            continue
        # again, we can ignore gtIgnore here, because we only look at areaRng[0]=all
        # again, we can ignore dtIgnore here, because we only look at maxDet=None
        numDTs = sum(len(img['dtIds']) for img in evalimgs)
        numGTs = sum(len(img['gtIds']) for img in evalimgs)
        overDTs = sum(len(img['dtIdsOver']) for img in evalimgs)
        overGTs = sum(len(img['gtIdsOver']) for img in evalimgs)
        underDTs = sum(len(img['dtIdsUnder']) for img in evalimgs)
        underGTs = sum(len(img['gtIdsUnder']) for img in evalimgs)
        numIoUs = sum(len(img['matches']) for img in evalimgs)
        numFPs = sum(len(img['dtMisses']) for img in evalimgs)
        numFNs = sum(len(img['gtMisses']) for img in evalimgs)
        sumIoUs = sum(sum(map(get(4), img['matches']))
                      for img in evalimgs)  # sum(iou)
        sumIoGTs = sum(sum(map(get(2), img['matches']))
                       for img in evalimgs)  # sum(iogt)
        sumIoDTs = sum(sum(map(get(3), img['matches']))
                       for img in evalimgs)  # sum(iodt)
        sumTParea = sum(sum(map(get(5), img['matches']))
                        for img in evalimgs)  # sum(inter)
        sumFParea = sum(sum(map(get(1), img['dtMisses']))
                        for img in evalimgs)  # sum(area)
        sumFNarea = sum(sum(map(get(1), img['gtMisses']))
                        for img in evalimgs)  # sum(area)
        if numDTs and numGTs:
            oversegmentation = overDTs * overGTs / numDTs / numGTs
            undersegmentation = underDTs * underGTs / numDTs / numGTs
            precision = (numDTs - numFPs) / numDTs
            recall = (numGTs - numFNs) / numGTs
        else:
            oversegmentation = undersegmentation = precision = recall = -1
        if numIoUs:
            iou = sumIoUs / numIoUs
            iogt = sumIoGTs / numIoUs
            iodt = sumIoDTs / numIoUs
        else:
            iou = iogt = iodt = -1
        if sumTParea or (sumFParea and sumFNarea):
            pixel_precision = sumTParea / (sumTParea + sumFParea)
            pixel_recall = sumTParea / (sumTParea + sumFNarea)
            pixel_iou = sumTParea / (sumTParea + sumFParea + sumFNarea)
        else:
            pixel_precision = pixel_recall = pixel_iou = -1
        catstats[cat['name']]['oversegmentation'] = oversegmentation
        catstats[cat['name']]['undersegmentation'] = undersegmentation
        catstats[cat['name']]['segment-precision'] = precision
        catstats[cat['name']]['segment-recall'] = recall
        catstats[cat['name']]['IoGT'] = iogt  # i.e. per-match pixel-recall
        catstats[cat['name']]['IoDT'] = iodt  # i.e. per-match pixel-precision
        catstats[cat['name']]['IoU'] = iou  # i.e. per-match pixel-jaccardindex
        catstats[cat['name']]['pixel-precision'] = pixel_precision
        catstats[cat['name']]['pixel-recall'] = pixel_recall
        catstats[cat['name']]['pixel-iou'] = pixel_iou

    coco_eval.accumulate()
    coco_eval.summarize()
    statInds = np.ones(12, dtype=bool)  # np.bool is removed in recent NumPy versions
    statInds[7] = False  # AR maxDet[1]
    statInds[8] = False  # AR maxDet[2]
    coco_eval.stats = coco_eval.stats[statInds]
    stats['scores'] = dict(
        zip([
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all ]',
            'Average Precision  (AP) @[ IoU=0.50      | area=   all ]',
            'Average Precision  (AP) @[ IoU=0.75      | area=   all ]',
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area= small ]',
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium ]',
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area= large ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area= small ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area= large ]',
        ], coco_eval.stats.tolist()))
    return stats
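The long comment in the function above describes the n:m matching rule: a GT/DT pair counts as matched when their IoU, or their intersection over either side, reaches 0.5. Below is a minimal standalone sketch (not part of the source) of that criterion on two toy masks, using pycocotools.mask directly for area/merge/IoU; the helper name is_match and the masks are made up for illustration.

import numpy as np
from pycocotools import mask as maskUtils

def is_match(gt_rle, dt_rle, thr=0.5):
    # mirrors the in-loop test: keep the pair unless IoU, IoGT and IoDT are all below thr
    iou = float(maskUtils.iou([dt_rle], [gt_rle], [0])[0, 0])
    union = float(maskUtils.area(maskUtils.merge([gt_rle, dt_rle])))
    inter = iou * union
    iogt = inter / float(maskUtils.area(gt_rle))
    iodt = inter / float(maskUtils.area(dt_rle))
    return iou >= thr or iogt >= thr or iodt >= thr

# toy masks: the detection covers only the left half of the ground truth
gt = np.zeros((10, 10), dtype=np.uint8); gt[2:8, 2:8] = 1
dt = np.zeros((10, 10), dtype=np.uint8); dt[2:8, 2:5] = 1
gt_rle = maskUtils.encode(np.asfortranarray(gt))
dt_rle = maskUtils.encode(np.asfortranarray(dt))
print(is_match(gt_rle, dt_rle))  # True: IoU is 0.5 and IoDT is 1.0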
Example #4
def _summarize_coco(cocoeval: COCOeval):
    """
    Compute and display summary metrics for evaluation results.
    Note this functin can *only* be applied on the default parameter setting
    """
    def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
        p = cocoeval.params
        iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
        titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
        typeStr = '(AP)' if ap == 1 else '(AR)'
        iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
            if iouThr is None else '{:0.2f}'.format(iouThr)

        aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
        mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
        if ap == 1:
            # dimension of precision: [TxRxKxAxM]
            s = cocoeval.eval['precision']
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, :, aind, mind]
        else:
            # dimension of recall: [TxKxAxM]
            s = cocoeval.eval['recall']
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, aind, mind]
        if len(s[s > -1]) == 0:
            mean_s = -1
        else:
            mean_s = np.mean(s[s > -1])
        print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        stats = np.zeros((12, ))
        stats[0] = _summarize(1, maxDets=cocoeval.params.maxDets[1])
        stats[1] = _summarize(1, iouThr=.5, maxDets=cocoeval.params.maxDets[1])
        stats[2] = _summarize(1, iouThr=.75, maxDets=cocoeval.params.maxDets[1])
        stats[3] = _summarize(1, areaRng='small', maxDets=cocoeval.params.maxDets[1])
        stats[4] = _summarize(1, areaRng='medium', maxDets=cocoeval.params.maxDets[1])
        stats[5] = _summarize(1, areaRng='large', maxDets=cocoeval.params.maxDets[1])
        stats[6] = _summarize(0, maxDets=cocoeval.params.maxDets[0])
        stats[9] = _summarize(0, areaRng='small', maxDets=cocoeval.params.maxDets[0])
        stats[10] = _summarize(0, areaRng='medium', maxDets=cocoeval.params.maxDets[0])
        stats[11] = _summarize(0, areaRng='large', maxDets=cocoeval.params.maxDets[0])
        return stats

    def _summarizeKps():
        stats = np.zeros((10, ))
        stats[0] = _summarize(1, maxDets=20)
        stats[1] = _summarize(1, maxDets=20, iouThr=.5)
        stats[2] = _summarize(1, maxDets=20, iouThr=.75)
        stats[3] = _summarize(1, maxDets=20, areaRng='medium')
        stats[4] = _summarize(1, maxDets=20, areaRng='large')
        stats[5] = _summarize(0, maxDets=20)
        stats[6] = _summarize(0, maxDets=20, iouThr=.5)
        stats[7] = _summarize(0, maxDets=20, iouThr=.75)
        stats[8] = _summarize(0, maxDets=20, areaRng='medium')
        stats[9] = _summarize(0, maxDets=20, areaRng='large')
        return stats

    if not cocoeval.eval:
        raise Exception('Please run accumulate() first')

    iouType = cocoeval.params.iouType

    if iouType == 'segm' or iouType == 'bbox':
        summarize = _summarizeDets
    elif iouType == 'keypoints':
        summarize = _summarizeKps
    else:
        raise Exception(f'iouType {iouType} is not supported')
    cocoeval.stats = summarize()
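A minimal usage sketch (not from the source; the "gt.json" and "res.json" paths are placeholders): _summarize_coco is meant to be called after evaluate() and accumulate(), with COCOeval left at its default parameters.

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("gt.json")                      # COCO-format ground truth
coco_dt = coco_gt.loadRes("res.json")          # COCO-format detection results
coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
coco_eval.evaluate()
coco_eval.accumulate()
_summarize_coco(coco_eval)                     # prints the summary and fills coco_eval.stats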
Example #5
def evaluate_coco(
    dataset_path,
    result_path,
    metric="bbox",
    classwise=False,
    proposal_nums=(10, 100, 500),
    iou_thrs=None,
    metric_items=None,
    out_dir=None,
):
    """Evaluation in COCO protocol.
    Args:
        dataset_path (str): COCO dataset json path.
        result_path (str): COCO result json path.
        metric (str | list[str]): Metrics to be evaluated. Options are
            'bbox', 'segm', 'proposal'.
        classwise (bool): Whether to evaluate the AP for each class.
        proposal_nums (Sequence[int]): Proposal number used for evaluating
            recalls, such as recall@100, recall@500.
            Default: (10, 100, 500).
        iou_thrs (Sequence[float], optional): IoU threshold used for
            evaluating recalls/mAPs. If set to a list, the average of all
            IoUs will also be computed. If not specified, [0.50, 0.55,
            0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
            Default: None.
        metric_items (list[str] | str, optional): Metric items that will
            be returned. If not specified, ``['AR@10', 'AR@100',
            'AR@500', 'AR_s@500', 'AR_m@500', 'AR_l@500' ]`` will be
            used when ``metric=='proposal'``, ``['mAP', 'mAP50', 'mAP75',
            'mAP_s', 'mAP_m', 'mAP_l', 'mAP50_s', 'mAP50_m', 'mAP50_l']``
            will be used when ``metric=='bbox' or metric=='segm'``.
        out_dir (str): Directory to save evaluation result json.
    Returns:
        dict[str, float]: COCO style evaluation metric.
    """

    metrics = metric if isinstance(metric, list) else [metric]
    allowed_metrics = ["bbox", "segm", "proposal"]
    for metric in metrics:
        if metric not in allowed_metrics:
            raise KeyError(f"metric {metric} is not supported")
    if iou_thrs is None:
        iou_thrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
    if metric_items is not None:
        if not isinstance(metric_items, list):
            metric_items = [metric_items]

    eval_results = OrderedDict()
    cocoGt = COCO(dataset_path)
    cat_ids = list(cocoGt.cats.keys())
    for metric in metrics:
        msg = f"Evaluating {metric}..."
        msg = "\n" + msg
        print(msg)

        iou_type = metric
        with open(result_path) as json_file:
            results = json.load(json_file)
        try:
            if iou_type == "segm":
                # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
                # When evaluating mask AP, if the results contain bbox,
                # cocoapi will use the box area instead of the mask area
                # for calculating the instance area. Though the overall AP
                # is not affected, this leads to different
                # small/medium/large mask AP results.
                for x in results:
                    x.pop("bbox")
                warnings.simplefilter("once")
                warnings.warn(
                    'The key "bbox" is deleted for more accurate mask AP '
                    "of small/medium/large instances since v2.12.0. This "
                    "does not change the overall mAP calculation.",
                    UserWarning,
                )
            cocoDt = cocoGt.loadRes(results)
        except IndexError:
            print("The testing results of the whole dataset is empty.")
            break

        cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
        cocoEval.params.catIds = cat_ids
        cocoEval.params.maxDets = list(proposal_nums)
        cocoEval.params.iouThrs = iou_thrs
        # mapping of cocoEval.stats
        coco_metric_names = {
            "mAP": 0,
            "mAP50": 1,
            "mAP75": 2,
            "mAP_s": 3,
            "mAP_m": 4,
            "mAP_l": 5,
            "AR@10": 6,
            "AR@100": 7,
            "AR@500": 8,
            "AR_s@500": 9,
            "AR_m@500": 10,
            "AR_l@500": 11,
            "mAP50_s": 12,
            "mAP50_m": 13,
            "mAP50_l": 14,
        }
        if metric_items is not None:
            for metric_item in metric_items:
                if metric_item not in coco_metric_names:
                    raise KeyError(f"metric item {metric_item} is not supported")
        if metric == "proposal":
            cocoEval.params.useCats = 0
            cocoEval.evaluate()
            cocoEval.accumulate()
            cocoEval.summarize()
            if metric_items is None:
                metric_items = ["AR@10", "AR@100", "AR@500", "AR_s@500", "AR_m@500", "AR_l@500"]

            for item in metric_items:
                val = float(f"{cocoEval.stats[coco_metric_names[item]]:.3f}")
                eval_results[item] = val
        else:
            cocoEval.evaluate()
            cocoEval.accumulate()
            cocoEval.summarize()
            # calculate mAP50_s/m/l
            mAP50_s = _cocoeval_summarize(
                cocoEval, ap=1, iouThr=0.5, areaRng="small", maxDets=cocoEval.params.maxDets[-1]
            )
            mAP50_m = _cocoeval_summarize(
                cocoEval, ap=1, iouThr=0.5, areaRng="medium", maxDets=cocoEval.params.maxDets[-1]
            )
            mAP50_l = _cocoeval_summarize(
                cocoEval, ap=1, iouThr=0.5, areaRng="large", maxDets=cocoEval.params.maxDets[-1]
            )
            cocoEval.stats = np.append(cocoEval.stats, [mAP50_s, mAP50_m, mAP50_l], 0)

            if classwise:  # Compute per-category AP
                # Compute per-category AP
                # from https://github.com/facebookresearch/detectron2/
                precisions = cocoEval.eval["precision"]
                # precision: (iou, recall, cls, area range, max dets)
                assert len(cat_ids) == precisions.shape[2]

                max_cat_name_len = 0
                for idx, catId in enumerate(cat_ids):
                    nm = cocoGt.loadCats(catId)[0]
                    cat_name_len = len(nm["name"])
                    max_cat_name_len = cat_name_len if cat_name_len > max_cat_name_len else max_cat_name_len

                results_per_category = []
                for idx, catId in enumerate(cat_ids):
                    # area range index 0: all area ranges
                    # max dets index -1: typically 100 per image
                    nm = cocoGt.loadCats(catId)[0]
                    ap = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="all",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap_s = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="small",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap_m = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="medium",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap_l = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        catIdx=idx,
                        areaRng="large",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50 = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="all",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50_s = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="small",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50_m = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="medium",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    ap50_l = _cocoeval_summarize(
                        cocoEval,
                        ap=1,
                        iouThr=0.5,
                        catIdx=idx,
                        areaRng="large",
                        maxDets=cocoEval.params.maxDets[-1],
                        catName=nm["name"],
                        nameStrLen=max_cat_name_len,
                    )
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP', f"{float(ap):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP_s', f"{float(ap_s):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP_m', f"{float(ap_m):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP_l', f"{float(ap_l):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP50', f"{float(ap50):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP50_s', f"{float(ap50_s):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP50_m', f"{float(ap50_m):0.3f}"))
                    results_per_category.append((f'{metric}_{nm["name"]}_mAP50_l', f"{float(ap50_l):0.3f}"))

                num_columns = min(6, len(results_per_category) * 2)
                results_flatten = list(itertools.chain(*results_per_category))
                headers = ["category", "AP"] * (num_columns // 2)
                results_2d = itertools.zip_longest(*[results_flatten[i::num_columns] for i in range(num_columns)])
                table_data = [headers]
                table_data += [result for result in results_2d]
                table = AsciiTable(table_data)
                print("\n" + table.table)

            if metric_items is None:
                metric_items = ["mAP", "mAP50", "mAP75", "mAP_s", "mAP_m", "mAP_l", "mAP50_s", "mAP50_m", "mAP50_l"]

            for metric_item in metric_items:
                key = f"{metric}_{metric_item}"
                val = float(f"{cocoEval.stats[coco_metric_names[metric_item]]:.3f}")
                eval_results[key] = val
            ap = cocoEval.stats[:6]
            eval_results[f"{metric}_mAP_copypaste"] = (
                f"{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} " f"{ap[4]:.3f} {ap[5]:.3f}"
            )
            if classwise:
                eval_results["results_per_category"] = {key: value for key, value in results_per_category}
    # set save path
    if not out_dir:
        out_dir = Path(result_path).parent
    save_path = str(Path(out_dir) / "eval.json")
    # export as json
    with open(save_path, "w", encoding="utf-8") as outfile:
        json.dump(eval_results, outfile, indent=4, separators=(",", ":"))
    return eval_results
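A hypothetical call (not from the source; both paths are placeholders). With metric="bbox" the returned dict contains keys such as "bbox_mAP" and "bbox_mAP50", plus the per-category results when classwise=True.

eval_results = evaluate_coco(
    dataset_path="annotations/instances_val.json",
    result_path="results/bbox_predictions.json",
    metric="bbox",
    classwise=True,
)
print(eval_results["bbox_mAP"], eval_results["bbox_mAP50"])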