Beispiel #1
0
    def demo(cls, n=10, p_true=0.5, p_error=0.2, rng=None):
        """
        Build a randomly generated set of binary confusion vectors for tests.

        Args:
            n (int, default=10): number of items to generate.

            p_true (float, default=0.5): threshold on the random score; items
                whose score is strictly greater than this value are initially
                labeled as true.

            p_error (float, default=0.2): each item's label is flipped when
                an independent uniform random draw is below this value.

            rng (default=None): seed or random state, passed through
                ``kwarray.ensure_rng``.

        Returns:
            an instance of ``cls`` built from the random data with ``cx=1``
            and classes ``['c1', 'c2', 'c3']``.

        Example:
            >>> cfsn = BinaryConfusionVectors.demo(n=1000, p_error=0.1)
            >>> print(cfsn.data._pandas())
            >>> roc_info = cfsn.roc()
            >>> pr_info = cfsn.precision_recall()
            >>> print('roc_info = {!r}'.format(roc_info))
            >>> print('pr_info = {!r}'.format(pr_info))
            >>> # xdoctest: +REQUIRES(--show)
            >>> import kwplot
            >>> kwplot.autompl()
            >>> kwplot.figure(fnum=1, pnum=(1, 2, 1))
            >>> pr_info.draw()
            >>> kwplot.figure(fnum=1, pnum=(1, 2, 2))
            >>> roc_info.draw()
        """
        import kwarray
        rng = kwarray.ensure_rng(rng)

        # First draw: the predicted scores, which also define the clean labels
        pred_score = rng.rand(n)
        clean_labels = (pred_score > p_true).astype(np.uint8)

        data = kwarray.DataFrameArray({
            'is_true': clean_labels,
            'pred_score': pred_score,
        })

        # Second draw: choose which labels are corrupted (flipped 0 <-> 1)
        flip = rng.rand(n) < p_error
        data['is_true'][flip] = 1 - data['is_true'][flip]

        return cls(data, cx=1, classes=['c1', 'c2', 'c3'])
Beispiel #2
0
    def binarize_peritem(cfsn_vecs, negative_classes=None):
        """
        Creates a binary representation useful for measuring the performance of
        detectors. It is assumed that scores of "positive" classes should be
        high and "negative" classes should be low.

        Args:
            negative_classes (List[str | int]): list of negative class names or
                idxs, by default chooses any class with a true class index of
                -1. These classes should ideally have low scores. Names are
                matched case-insensitively against ``cfsn_vecs.classes``;
                names that are not found are skipped.

        Returns:
            BinaryConfusionVectors: vectors where ``is_true`` is False for
                every item whose true class index is negative, and
                ``pred_score`` is the original ``score`` column. Any of the
                ``txs``, ``pxs``, ``gid`` and ``weight`` columns present in
                the input are carried over.

        Raises:
            Exception: if a negative class is given by name but
                ``cfsn_vecs.classes`` is None.

        Example:
            >>> # xdoctest: +REQUIRES(module:ndsampler)
            >>> from netharn.metrics import DetectionMetrics
            >>> dmet = DetectionMetrics.demo(
            >>>     nimgs=10, nboxes=(0, 10), n_fp=(0, 1), nclasses=3)
            >>> cfsn_vecs = dmet.confusion_vectors()
            >>> class_idxs = list(dmet.classes.node_to_idx.values())
            >>> binvecs = cfsn_vecs.binarize_peritem()
        """
        import kwarray

        # The index -1 (no true class) is always treated as negative
        negative_cidxs = {-1}
        if negative_classes is not None:
            import six

            @ub.memoize
            def _lower_classes():
                # Only computed (once) if some negative class is a string
                if cfsn_vecs.classes is None:
                    raise Exception(
                        'classes must be known if negative_classes are strings'
                    )
                return [c.lower() for c in cfsn_vecs.classes]

            for c in negative_classes:
                if isinstance(c, six.string_types):
                    classes = _lower_classes()
                    try:
                        # The lookup table is lower-cased, so the query must
                        # be lower-cased as well to be able to match.
                        cidx = classes.index(c.lower())
                    except ValueError:
                        # Unknown class names are ignored
                        continue
                else:
                    cidx = int(c)
                negative_cidxs.add(cidx)

        is_false = kwarray.isect_flags(cfsn_vecs.data['true'], negative_cidxs)

        _data = {
            'is_true': ~is_false,
            'pred_score': cfsn_vecs.data['score'],
        }
        # Carry over the bookkeeping columns from the source vectors
        extra = ub.dict_isect(cfsn_vecs.data._data,
                              ['txs', 'pxs', 'gid', 'weight'])
        _data.update(extra)
        bin_data = kwarray.DataFrameArray(_data)
        binvecs = BinaryConfusionVectors(bin_data)
        return binvecs
Beispiel #3
0
    def from_arrays(ConfusionVectors,
                    true,
                    pred=None,
                    score=None,
                    weight=None,
                    probs=None,
                    classes=None):
        """
        Build a confusion vector data structure out of its component arrays.

        Args:
            true: true class index of each item.
            pred: predicted class index of each item. If unspecified it is
                taken as the argmax of ``probs`` over axis 1.
            score: optional score of each item.
            weight: optional weight of each item.
            probs: optional per-class probabilities of each item.
            classes: class names or a ``ndsampler.CategoryTree``.

        Raises:
            ValueError: if neither ``pred`` nor ``probs`` is given.
            Exception: if ``pred`` must be inferred but ``classes`` is a
                category tree that is not mutually exclusive.

        Example:
            >>> # xdoctest: +REQUIRES(module:ndsampler)
            >>> import kwarray
            >>> classes = ['person', 'vehicle', 'object']
            >>> rng = kwarray.ensure_rng(0)
            >>> true = (rng.rand(10) * len(classes)).astype(int)
            >>> probs = rng.rand(len(true), len(classes))
            >>> cfsn_vecs = ConfusionVectors.from_arrays(true=true, probs=probs, classes=classes)
            >>> cfsn_vecs.confusion_matrix()
            pred     person  vehicle  object
            real
            person        0        0       0
            vehicle       2        4       1
            object        2        1       0
        """
        import kwarray

        if pred is None:
            if probs is None:
                raise ValueError('Must specify pred (or probs)')
            import ndsampler
            is_tree = isinstance(classes, ndsampler.CategoryTree)
            if is_tree and not classes.is_mutex():
                raise Exception('Graph categories require explicit pred')
            # All classes can be assumed mutually exclusive at this point,
            # so the most probable class is the prediction.
            pred = probs.argmax(axis=1)

        # Only the columns that were actually provided are stored
        candidates = [
            ('true', true),
            ('pred', pred),
            ('score', score),
            ('weight', weight),
        ]
        columns = {}
        for key, column in candidates:
            if column is not None:
                columns[key] = column

        cfsn_data = kwarray.DataFrameArray(columns)
        return ConfusionVectors(cfsn_data, probs=probs, classes=classes)
Beispiel #4
0
    def coarsen(cfsn_vecs, cxs):
        """
        Creates a coarsened set of vectors

        Every true / predicted class index is replaced by the index of the
        requested coarse class it descends from (a class counts as its own
        descendant). Any index that does not fall under one of the requested
        classes is mapped to the index of the 'background' class.

        Args:
            cxs (Iterable[int]): class indexes to keep as coarse categories.

        Returns:
            ConfusionVectors: new vectors with remapped ``true`` / ``pred``
                columns. ``score`` is the probability of the new predicted
                class read from ``cfsn_vecs.probs``. The ``weight``, ``txs``,
                ``pxs`` and ``gid`` columns, as well as ``classes`` and
                ``probs``, are reused from the input.

        Raises:
            AssertionError: if ``cfsn_vecs.probs`` is None.
            TypeError: if ``cfsn_vecs.classes`` is not a
                ``ndsampler.CategoryTree``.
        """
        import ndsampler
        import kwarray
        assert cfsn_vecs.probs is not None, 'need probs'
        if not isinstance(cfsn_vecs.classes, ndsampler.CategoryTree):
            raise TypeError('classes must be a ndsampler.CategoryTree')

        # Each class must be included among its own descendants so that
        # items already labeled with a coarse class keep that label.
        descendent_map = cfsn_vecs.classes.idx_to_descendants_idxs(
            include_self=True)
        valid_descendant_mapping = ub.dict_isect(descendent_map, cxs)
        # mapping from current category indexes to the new coarse ones
        # Anything without an explicit key will be mapped to background

        bg_idx = cfsn_vecs.classes.index('background')
        mapping = {
            v: k
            for k, vs in valid_descendant_mapping.items() for v in vs
        }
        new_true = np.array(
            [mapping.get(x, bg_idx) for x in cfsn_vecs.data['true']])
        new_pred = np.array(
            [mapping.get(x, bg_idx) for x in cfsn_vecs.data['pred']])

        # The new score is the probability assigned to the coarse prediction
        new_score = np.array([p[x] for x, p in zip(new_pred, cfsn_vecs.probs)])

        new_y_df = {
            'true': new_true,
            'pred': new_pred,
            'score': new_score,
            'weight': cfsn_vecs.data['weight'],
            'txs': cfsn_vecs.data['txs'],
            'pxs': cfsn_vecs.data['pxs'],
            'gid': cfsn_vecs.data['gid'],
        }
        new_y_df = kwarray.DataFrameArray(new_y_df)
        coarse_cfsn_vecs = ConfusionVectors(new_y_df, cfsn_vecs.classes,
                                            cfsn_vecs.probs)
        return coarse_cfsn_vecs
Beispiel #5
0
    def binarize_ovr(cfsn_vecs,
                     mode=1,
                     keyby='name',
                     ignore_classes={'ignore'}):
        """
        Transforms cfsn_vecs into one-vs-rest BinaryConfusionVectors for each category.

        Args:
            mode (int, default=1): 0 for hierarchy aware or 1 for voc like.
                MODE 0 IS PROBABLY BROKEN
            keyby (str, default='name'): either 'cx' to key the result by
                class index or 'name' to key it by class name.
            ignore_classes (Set[str]): category names to ignore. The
                'background' class is always skipped as well.

        Returns:
            OneVsRestConfusionVectors: which behaves like
                Dict[int, BinaryConfusionVectors]: cx_to_binvecs

        Raises:
            ValueError: if mode is 0 but no probs are available, or if mode
                is neither 0 nor 1.
            KeyError: if keyby is neither 'cx' nor 'name'.

        Example:
            >>> # xdoctest: +REQUIRES(module:ndsampler)
            >>> cfsn_vecs = ConfusionVectors.demo()
            >>> print('cfsn_vecs = {!r}'.format(cfsn_vecs))
            >>> catname_to_binvecs = cfsn_vecs.binarize_ovr(keyby='name')
            >>> print('catname_to_binvecs = {!r}'.format(catname_to_binvecs))

        Notes:
            Consider we want to measure how well we can classify beagles.

            Given a multiclass confusion vector, we need to carefully select a
            subset. We ignore any truth that is coarser than our current label.
            We also ignore any background predictions on irrelevant classes

            y_true     | y_pred     | score
            -------------------------------
            dog        | dog          <- ignore coarser truths
            dog        | cat          <- ignore coarser truths
            dog        | beagle       <- ignore coarser truths
            cat        | dog
            cat        | cat
            cat        | background   <- ignore failures to predict unrelated classes
            cat        | maine-coon
            beagle     | beagle
            beagle     | dog
            beagle     | background
            beagle     | cat
            Snoopy     | beagle
            Snoopy     | cat
            maine-coon | background    <- ignore failures to predict unrelated classes
            maine-coon | beagle
            maine-coon | cat

            Anything not marked as ignore is counted. We count anything marked
            as beagle or a finer grained class (e.g.  Snoopy) as a positive
            case. All other cases are negative. The scores come from the
            predicted probability of beagle, which must be remembered outside
            the dataframe.
        """
        import warnings
        import kwarray

        classes = cfsn_vecs.classes
        data = cfsn_vecs.data

        if mode == 0:
            if cfsn_vecs.probs is None:
                raise ValueError('cannot binarize in mode=0 without probs')
            pdist = classes.idx_pairwise_distance()

        cx_to_binvecs = {}
        for cx in range(len(classes)):
            if classes[cx] == 'background' or classes[cx] in ignore_classes:
                continue

            if mode == 0:
                warnings.warn(
                    'THIS CALCLUATION MIGHT BE WRONG. MANY OTHERS '
                    'IN THIS FILE WERE, AND I HAVENT CHECKED THIS ONE YET')

                # Lookup original probability predictions for the class of interest
                new_scores = cfsn_vecs.probs[:, cx]

                # Determine which truth items have compatible classes
                # Note: we ignore any truth-label that is COARSER than the
                # class-of-interest.
                # E.g: how well do we classify Beagle? -> we should ignore any truth
                # label marked as Dog because it may or may not be a Beagle?
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore', category=RuntimeWarning)
                    dist = pdist[cx]
                    coarser_cxs = np.where(dist < 0)[0]
                    finer_eq_cxs = np.where(dist >= 0)[0]

                is_finer_eq = kwarray.isect_flags(data['true'], finer_eq_cxs)
                is_coarser = kwarray.isect_flags(data['true'], coarser_cxs)

                # Construct a binary data frame to pass to sklearn functions.
                # Items with a coarser truth label get a weight of zero.
                bin_data = {
                    'is_true': is_finer_eq.astype(np.uint8),
                    'pred_score': new_scores,
                    'weight': data['weight'] * (np.float32(1.0) - is_coarser),
                    'txs': cfsn_vecs.data['txs'],
                    'pxs': cfsn_vecs.data['pxs'],
                    'gid': cfsn_vecs.data['gid'],
                }
                bin_data = kwarray.DataFrameArray(bin_data)

                # Ignore cases where we failed to predict an irrelevant class
                flags = (data['pred'] == -1) & (bin_data['is_true'] == 0)
                bin_data['weight'][flags] = 0
                # bin_data = bin_data.compress(~flags)
                bin_cfsn = BinaryConfusionVectors(bin_data, cx, classes)

            elif mode == 1:
                # More VOC-like, not hierarchy friendly

                if cfsn_vecs.probs is not None:
                    # We know the actual score predicted for this category in
                    # this case.
                    is_true = cfsn_vecs.data['true'] == cx
                    pred_score = cfsn_vecs.probs[:, cx]
                else:
                    warnings.warn(
                        'Binarize ovr is only approximate if not all probabilities are known'
                    )
                    # If we don't know the probabilities for non-predicted
                    # categories then we have to guess.
                    is_true = cfsn_vecs.data['true'] == cx

                    # do we know the actual predicted score for this category?
                    score_is_unknown = data['pred'] != cx
                    pred_score = data['score'].copy()

                    # These scores were for a different class, so assume
                    # other classes were predicted with a uniform prior
                    approx_score = (1 - pred_score[score_is_unknown]) / (
                        len(classes) - 1)

                    # Except in the case where predicted class is -1. In this
                    # case no prediction was actually made (above a threshold)
                    # so the assumed score should be significantly lower, we
                    # conservatively choose zero.
                    unknown_preds = data['pred'][score_is_unknown]
                    approx_score[unknown_preds == -1] = 0

                    pred_score[score_is_unknown] = approx_score

                bin_data = {
                    # is_true denotes if the true class of the item is the
                    # category of interest.
                    'is_true': is_true,
                    'pred_score': pred_score,
                }

                # Carry over whichever bookkeeping columns exist in the input
                extra = ub.dict_isect(data._data,
                                      ['txs', 'pxs', 'gid', 'weight'])
                bin_data.update(extra)

                bin_data = kwarray.DataFrameArray(bin_data)
                bin_cfsn = BinaryConfusionVectors(bin_data, cx, classes)
            else:
                # Fail explicitly instead of hitting an unbound `bin_cfsn`
                raise ValueError(
                    'unknown mode={!r}, expected 0 or 1'.format(mode))
            cx_to_binvecs[cx] = bin_cfsn

        if keyby == 'name':
            # Re-key the result by class name instead of class index
            cx_to_binvecs = ub.map_keys(cfsn_vecs.classes, cx_to_binvecs)
        elif keyby != 'cx':
            raise KeyError(keyby)

        ovr_cfns = OneVsRestConfusionVectors(cx_to_binvecs, cfsn_vecs.classes)
        return ovr_cfns
Beispiel #6
0
    def confusion_vectors(dmet,
                          ovthresh=0.5,
                          bias=0,
                          gids=None,
                          compat='all',
                          prioritize='iou',
                          ignore_classes='ignore',
                          background_class=ub.NoParam,
                          verbose='auto',
                          workers=0):
        """
        Assigns predicted boxes to the true boxes so we can transform the
        detection problem into a classification problem for scoring.

        Args:

            ovthresh (float, default=0.5):
                bounding box overlap iou threshold required for assignment

            bias (float, default=0.0):
                for computing bounding box overlap, either 1 or 0

            gids (List[int], default=None):
                which subset of images ids to compute confusion metrics on. If
                not specified all images are used.

            compat (str, default='all'):
                can be ('ancestors' | 'mutex' | 'all').  determines which pred
                boxes are allowed to match which true boxes. If 'mutex', then
                pred boxes can only match true boxes of the same class. If
                'ancestors', then pred boxes can match true boxes that match or
                have a coarser label. If 'all', then any pred can match any
                true, regardless of its category label.

            prioritize (str, default='iou'):
                can be ('iou' | 'class' | 'correct') determines which box to
                assign to if multiple true boxes overlap a predicted box.  if
                prioritize is iou, then the true box with maximum iou (above
                ovthresh) will be chosen.  If prioritize is class, then it will
                prefer matching a compatible class above a higher iou. If
                prioritize is correct, then ancestors of the true class are
                preferred over descendants of the true class, over unrelated
                classes.

            ignore_classes (str | set, default='ignore'):
                class names indicating ignore regions. Passed through
                unchanged to the assignment function.

            background_class (str, default=ub.NoParam):
                Name of the background class. If unspecified we try to
                determine it with heuristics (a case-insensitive search for a
                class named 'background'). A value of None means there is no
                background class.

            verbose (int, default='auto'): verbosity flag. In auto mode,
                verbose=1 if len(gids) > 10.

            workers (int, default=0):
                number of parallel assignment processes.
                NOTE: this value is currently overridden to 0 inside the
                function, so it has no effect.

        Returns:
            ConfusionVectors: the per-image assignments concatenated into a
                single table (with an added ``gid`` column), together with
                the stacked per-class probabilities when every processed
                prediction provides them (otherwise ``probs`` is None).

        Ignore:
            globals().update(xdev.get_func_kwargs(dmet.confusion_vectors))
        """
        import kwarray
        y_accum = ub.ddict(list)

        # Per-class probabilities are tracked until an image is found whose
        # predictions do not provide them.
        TRACK_PROBS = True
        prob_accum = []

        if gids is None:
            gids = sorted(dmet._imgname_to_gid.values())

        if verbose == 'auto':
            verbose = 1 if len(gids) > 10 else 0

        if background_class is ub.NoParam:
            # Try to autodetermine background class name,
            # otherwise fallback to None
            background_class = None
            if dmet.classes is not None:
                lower_classes = [c.lower() for c in dmet.classes]
                try:
                    idx = lower_classes.index('background')
                    background_class = dmet.classes[idx]
                    # TODO: if we know the background class name should we
                    # change bg_cidx in assignment?
                except ValueError:
                    pass

        from ndsampler.utils import util_futures
        # NOTE(review): the `workers` argument is deliberately kept but
        # overridden here, so the pool is always created with max_workers=0.
        # Presumably a safeguard against problems with process-based
        # assignment -- confirm before honoring the argument.
        workers = 0
        jobs = util_futures.JobPool(mode='process', max_workers=workers)

        for gid in ub.ProgIter(gids,
                               desc='submit assign jobs',
                               verbose=verbose):
            true_dets = dmet.true_detections(gid)
            pred_dets = dmet.pred_detections(gid)
            job = jobs.submit(_assign_confusion_vectors,
                              true_dets,
                              pred_dets,
                              bg_weight=1,
                              ovthresh=ovthresh,
                              bg_cidx=-1,
                              bias=bias,
                              classes=dmet.classes,
                              compat=compat,
                              prioritize=prioritize,
                              ignore_classes=ignore_classes)
            # Remember which image the job belongs to
            job.gid = gid

        for job in ub.ProgIter(jobs.jobs,
                               desc='assign detections',
                               verbose=verbose):
            y = job.result()
            gid = job.gid

            if TRACK_PROBS:
                # Keep track of per-class probs
                pred_dets = dmet.pred_detections(gid)
                try:
                    pred_probs = pred_dets.probs
                except KeyError:
                    TRACK_PROBS = False
                else:
                    # `np.int` was only an alias of the builtin int and no
                    # longer exists in recent NumPy releases.
                    pxs = np.array(y['pxs'], dtype=int)

                    # For unassigned truths, we need to create dummy probs
                    # where a background class has probability 1.
                    flags = pxs > -1
                    probs = np.zeros((len(pxs), pred_probs.shape[1]),
                                     dtype=np.float32)
                    if background_class is not None:
                        bg_idx = dmet.classes.index(background_class)
                        probs[:, bg_idx] = 1
                    probs[flags] = pred_probs[pxs[flags]]
                    prob_accum.append(probs)

            y['gid'] = [gid] * len(y['pred'])
            for k, v in y.items():
                y_accum[k].extend(v)

        _data = {}
        for k, v in ub.ProgIter(list(y_accum.items()),
                                desc='ndarray convert',
                                verbose=verbose):
            # Try to use 32 bit types for large evaluation problems
            kw = dict()
            if k in {'iou', 'score', 'weight'}:
                kw['dtype'] = np.float32
            if k in {'pxs', 'txs', 'gid', 'pred', 'true', 'pred_raw'}:
                kw['dtype'] = np.int32
            try:
                _data[k] = np.asarray(v, **kw)
            except TypeError:
                # Fall back to letting numpy infer the dtype
                _data[k] = np.asarray(v)

        # Avoid pandas when possible
        cfsn_data = kwarray.DataFrameArray(_data)

        if TRACK_PROBS and prob_accum:
            y_prob = np.vstack(prob_accum)
        else:
            # Either some prediction had no probs, or no image was processed
            # (np.vstack cannot stack an empty list).
            y_prob = None
        cfsn_vecs = ConfusionVectors(cfsn_data,
                                     classes=dmet.classes,
                                     probs=y_prob)

        return cfsn_vecs
Beispiel #7
0
def tabular_coco_targets(dset):
    """
    Transforms COCO box annotations into a tabular form

    Args:
        dset: a COCO-style dataset exposing ``imgs`` (image id -> image dict
            with 'width' and 'height') and ``dataset['annotations']``. If it
            provides a ``tabular_targets`` method, that method is used
            instead and its result is returned directly.

    Returns:
        kwarray.DataFrameArray: one row per annotation that has a bounding
            box, with the columns 'aid', 'gid', 'category_id', 'cx', 'cy',
            'width', 'height', 'img_width' and 'img_height'.

    Raises:
        KeyError: if ``dset.dataset`` has no 'annotations' entry.

    _ = xdev.profile_now(tabular_coco_targets)(dset)
    """
    import warnings
    # TODO: better handling of non-bounding box annotations; ignore for now

    if hasattr(dset, 'tabular_targets'):
        # In the SQL case, we can write a single query that
        # builds the table more efficiently.
        return dset.tabular_targets()

    img_items = list(dset.imgs.items())
    gid_to_width = {gid: img['width'] for gid, img in img_items}
    gid_to_height = {gid: img['height'] for gid, img in img_items}

    # Load the annotations outside of the try block: the fallback below
    # needs `anns`, and a failure here should surface as itself.
    anns = dset.dataset['annotations']
    if not isinstance(anns, list):
        anns = list(anns)

    try:
        # Fast path: every annotation has a usable bbox
        xywh = [ann['bbox'] for ann in anns]
        xywh = np.array(xywh, dtype=np.float32)
    except Exception:
        # Slow path: drop annotations without a bbox and retry. The same
        # conversion is repeated, so any unrelated error is raised again.
        has_bbox = [ann.get('bbox', None) is not None for ann in anns]
        if not all(has_bbox):
            n_missing = len(has_bbox) - sum(has_bbox)
            warnings.warn('CocoDataset is missing boxes '
                          'for {} annotations'.format(n_missing))
        anns = list(ub.compress(anns, has_bbox))
        xywh = [ann['bbox'] for ann in anns]
        xywh = np.array(xywh, dtype=np.float32)

    boxes = kwimage.Boxes(xywh, 'xywh')
    cxywhs = boxes.to_cxywh().data.reshape(-1, 4)

    aids = [ann['id'] for ann in anns]
    gids = [ann['image_id'] for ann in anns]
    cids = [ann['category_id'] for ann in anns]

    img_width = [gid_to_width[gid] for gid in gids]
    img_height = [gid_to_height[gid] for gid in gids]

    aids = np.array(aids, dtype=np.int32)
    gids = np.array(gids, dtype=np.int32)
    cids = np.array(cids, dtype=np.int32)

    table = {
        # Annotation / Image / Category ids
        'aid': aids,
        'gid': gids,
        'category_id': cids,
        # Subpixel box localizations wrt parent image
        'cx': cxywhs.T[0],
        'cy': cxywhs.T[1],
        'width': cxywhs.T[2],
        'height': cxywhs.T[3],
    }

    # Width / height of the parent image of each annotation
    table['img_width'] = np.array(img_width, dtype=np.int32)
    table['img_height'] = np.array(img_height, dtype=np.int32)

    targets = kwarray.DataFrameArray(table)
    return targets
Beispiel #8
0
    def _random_negatives(self, num, exact=False, neg_anchors=None,
                          window_size=None, rng=None, thresh=0.0):
        """
        Samples multiple negatives at once for efficiency

        Args:
            num (int): number of negatives to sample

            exact (bool): if True, we will try to find exactly `num` negatives,
                otherwise the number returned is approximate.

            neg_anchors (): prior normalized aspect ratios for negative boxes.
                Mutually exclusive with `window_size`. If neither this nor
                `window_size` is given, `self.neg_anchors` is used.

            window_size (Tuple): absolute box size (width, height)
                used to sample negative regions. If not specified the relative
                anchor strategy will be used to randomly choose potentially
                non-square regions relative to the image size.

            rng (default=None): seed or random state, passed through
                `kwarray.ensure_rng`.

            thresh (float): overlap area threshold as a percentage of the
                negative box size. When thresh=0.0, that means negatives cannot
                overlap any positive, when thresh=1.0, there are no constraints
                on negative placement.

        Returns:
            DataFrameArray: targets - contains negative target information.
                Every row has aid=-1 and category_id=self.BACKGROUND_CLASS_ID.

        Example:
            >>> from ndsampler.coco_regions import *
            >>> from ndsampler import coco_sampler
            >>> self = coco_sampler.CocoSampler.demo().regions
            >>> num = 100
            >>> rng = kwarray.ensure_rng(0)
            >>> targets = self._random_negatives(num, rng=rng)
            >>> assert len(targets) <= num
            >>> targets = self._random_negatives(num, exact=True)
            >>> assert len(targets) == num
        """
        rng = kwarray.ensure_rng(rng)

        # Fall back to the relative anchor strategy if no box prior is given
        if neg_anchors is None and window_size is None:
            neg_anchors = self.neg_anchors

        gids, boxes = self.isect_index.random_negatives(
            num, anchors=neg_anchors, window_size=window_size, exact=exact,
            rng=rng, thresh=thresh)

        targets = kwarray.DataFrameArray(columns=['gid', 'aid', 'cx', 'cy',
                                                  'width', 'height',
                                                  'img_width', 'img_height'])
        targets['gid'] = gids
        # Negatives do not correspond to any real annotation
        targets['aid'] = [-1] * len(gids)
        targets['category_id'] = [self.BACKGROUND_CLASS_ID] * len(gids)
        if len(boxes) > 0:
            # Without any boxes the box columns keep their initial state
            cxywh = boxes.to_cxywh().data
            targets['cx'] = cxywh.T[0]
            targets['cy'] = cxywh.T[1]
            targets['width'] = cxywh.T[2]
            targets['height'] = cxywh.T[3]

        # Look up the size of the parent image of each negative
        imgs = [self.dset.imgs[gid] for gid in gids]
        targets['img_width'] = [img['width'] for img in imgs]
        targets['img_height'] = [img['height'] for img in imgs]
        return targets