Example #1
0
    def test_split_for_classification_single_class_single_attr(self):
        """A single label with one attribute is split 70/30 while the
        attribute-value distribution is preserved in both subsets."""
        config = {"label": {"attrs": ["attr"], "counts": {0: 10, 1: 20, 2: 30}}}
        source = self._generate_dataset(config)
        task = splitter.SplitTask.classification.name

        actual = splitter.Split(
            source, task, [("train", 0.7), ("test", 0.3)], seed=100)

        self.assertEqual(42, len(actual.get_subset("train")))
        self.assertEqual(18, len(actual.get_subset("test")))

        # expected count of each attribute value, per subset
        expected = {
            "train": {"0": 7, "1": 14, "2": 21},
            "test": {"0": 3, "1": 6, "2": 9},
        }
        for sname, value_counts in expected.items():
            stats = compute_ann_statistics(actual.get_subset(sname))
            dist = stats["annotations"]["labels"]["attributes"]["attr"][
                "distribution"]
            for value, count in value_counts.items():
                self.assertEqual(count, dist[value][0])
Example #2
0
    def test_split_for_classification_single_class_multi_attr(self):
        """One label with two attributes; check both an exact 70/30 split
        and a 95/5 split whose per-group sizes do not divide evenly."""
        counts = {
            (0, 0): 20, (0, 1): 20, (0, 2): 30,
            (1, 0): 20, (1, 1): 10, (1, 2): 20,
        }
        config = {"label": {"attrs": ["attr1", "attr2"], "counts": counts}}
        source = self._generate_dataset(config)
        task = splitter.SplitTask.classification.name

        with self.subTest("zero remainder"):
            actual = splitter.Split(
                source, task, [("train", 0.7), ("test", 0.3)], seed=100)

            self.assertEqual(84, len(actual.get_subset("train")))
            self.assertEqual(36, len(actual.get_subset("test")))

            # expected counts per attribute value, per subset
            expected = {
                "train": {"attr1": {"0": 49, "1": 35},
                          "attr2": {"0": 28, "1": 21, "2": 35}},
                "test": {"attr1": {"0": 21, "1": 15},
                         "attr2": {"0": 12, "1": 9, "2": 15}},
            }
            for sname, per_attr in expected.items():
                stats = compute_ann_statistics(actual.get_subset(sname))
                attr_stats = stats["annotations"]["labels"]["attributes"]
                for attr, value_counts in per_attr.items():
                    for value, count in value_counts.items():
                        self.assertEqual(
                            count, attr_stats[attr]["distribution"][value][0])

        with self.subTest("non-zero remainder"):
            actual = splitter.Split(
                source, task, [("train", 0.95), ("test", 0.05)], seed=100)

            self.assertEqual(114, len(actual.get_subset("train")))
            self.assertEqual(6, len(actual.get_subset("test")))
Example #3
0
def stats_command(args):
    """Compute image and annotation statistics for the project dataset
    and dump them to the next available 'statistics*.json' file."""
    dataset = load_project(args.project_dir).make_dataset()

    stats = {
        **compute_image_statistics(dataset),
        **compute_ann_statistics(dataset),
    }

    dst_file = generate_next_file_name('statistics', ext='.json')
    log.info("Writing project statistics to '%s'" % dst_file)
    with open(dst_file, 'w') as f:
        json.dump(stats, f, indent=4, sort_keys=True)
Example #4
0
    def test_split_for_classification_multi_class_no_attr(self):
        """Three attribute-less labels with 10/20/30 items; a 70/30 split
        keeps each label's proportion in both subsets."""
        config = {
            "label%d" % i: {"attrs": None, "counts": count}
            for i, count in ((1, 10), (2, 20), (3, 30))
        }
        source = self._generate_dataset(config)
        task = splitter.SplitTask.classification.name

        actual = splitter.Split(
            source, task, [("train", 0.7), ("test", 0.3)], seed=100)

        self.assertEqual(42, len(actual.get_subset("train")))
        self.assertEqual(18, len(actual.get_subset("test")))

        # expected count of each label, per subset
        expected = {
            "train": {"label1": 7, "label2": 14, "label3": 21},
            "test": {"label1": 3, "label2": 6, "label3": 9},
        }
        for sname, label_counts in expected.items():
            stats = compute_ann_statistics(actual.get_subset(sname))
            dist = stats["annotations"]["labels"]["distribution"]
            for label, count in label_counts.items():
                self.assertEqual(count, dist[label][0])
Example #5
0
    def test_stats_with_empty_dataset(self):
        """Statistics of a dataset whose items carry no annotations:
        every counter is zero and both items are reported unannotated."""
        label_names = ['label_%s' % i for i in range(4)]
        dataset = Dataset.from_iterable(
            [DatasetItem(id=1), DatasetItem(id=3)],
            categories=label_names)

        # every label appears zero times, with a 0.0 share
        zero_dist = {name: [0, 0.0] for name in label_names}
        expected = {
            'images count': 2,
            'annotations count': 0,
            'unannotated images count': 2,
            'unannotated images': ['1', '3'],
            'annotations by type': {
                ann_type: {'count': 0}
                for ann_type in ('label', 'polygon', 'polyline', 'bbox',
                                 'mask', 'points', 'caption', 'cuboid_3d')
            },
            'annotations': {
                'labels': {
                    'count': 0,
                    'distribution': zero_dist,
                    'attributes': {},
                },
                'segments': {
                    'avg. area': 0.0,
                    'area distribution': [],
                    'pixel distribution': dict(zero_dist),
                },
            },
        }

        self.assertEqual(expected, compute_ann_statistics(dataset))
Example #6
0
def stats_command(args):
    """Compute the requested statistics for the target dataset (optionally
    restricted to one subset) and dump them to a 'statistics*.json' file."""
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is tolerated only when no directory was given
        # explicitly; the target may still resolve without one.
        if args.project_dir:
            raise
        project = None

    dataset, target_project = parse_full_revpath(args.target, project)
    if target_project:
        scope_add(target_project)

    if args.subset:
        dataset = dataset.get_subset(args.subset)

    # Gather only the statistics families the user asked for
    stats = {}
    for enabled, compute in ((args.image_stats, compute_image_statistics),
                             (args.ann_stats, compute_ann_statistics)):
        if enabled:
            stats.update(compute(dataset))

    dst_file = generate_next_file_name('statistics', ext='.json')
    log.info("Writing project statistics to '%s'" % dst_file)
    dump_json_file(dst_file, stats, indent=True)
Example #7
0
    def test_split_for_reidentification(self):
        '''
        Test ReidentificationSplit using Dataset with label (ImageNet style)
        '''
        def _get_present(stat):
            # Collect the labels (or attribute values) that actually occur,
            # i.e. have a non-zero count in the distribution.
            values_present = []
            for label, dist in stat["distribution"].items():
                if dist[0] > 0:
                    values_present.append(label)
            return set(values_present)

        # Run the same scenario twice: IDs stored as an attribute ("PID")
        # on one label, and IDs encoded directly as separate labels.
        for with_attr in [True, False]:
            if with_attr:
                # 10 IDs with 7/14/21 samples each, cycling by i % 3
                counts = {i: (i % 3 + 1) * 7 for i in range(10)}
                config = {"person": {"attrs": ["PID"], "counts": counts}}
                attr_for_id = "PID"
            else:
                counts = {}
                config = dict()
                for i in range(10):
                    label = "label%d" % i
                    count = (i % 3 + 1) * 7
                    counts[label] = count
                    config[label] = {"attrs": None, "counts": count}
                attr_for_id = None
            source = self._generate_dataset(config)
            splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)]
            # Fraction of each test ID's samples that goes to the query
            # subset (the rest forms the gallery) — see assertions below.
            query = 0.4 / 0.7
            actual = splitter.ReidentificationSplit(source, splits, query,
                                                    attr_for_id)

            # Gather per-subset label statistics; with attributes, drill
            # down to the "PID" attribute's entry instead.
            stats = dict()
            for sname in ["train", "val", "test-query", "test-gallery"]:
                subset = actual.get_subset(sname)
                stat = compute_ann_statistics(subset)["annotations"]["labels"]
                if with_attr:
                    stat = stat["attributes"]["PID"]
                stats[sname] = stat

            # check size of subsets
            self.assertEqual(65, stats["train"]["count"])
            self.assertEqual(26, stats["val"]["count"])
            self.assertEqual(18, stats["test-gallery"]["count"])
            self.assertEqual(24, stats["test-query"]["count"])

            # check ID separation between test set and others
            train_ids = _get_present(stats["train"])
            test_ids = _get_present(stats["test-gallery"])
            for pid in train_ids:
                assert pid not in test_ids
            self.assertEqual(7, len(train_ids))
            self.assertEqual(3, len(test_ids))
            self.assertEqual(train_ids, _get_present(stats["val"]))
            self.assertEqual(test_ids, _get_present(stats["test-query"]))

            # check trainval set statistics: train/val keep the 0.5 : 0.2
            # ratio within the 0.7 trainval share, per ID and in total
            trainval = stats["train"]["count"] + stats["val"]["count"]
            expected_train_count = int(trainval * 0.5 / 0.7)
            expected_val_count = int(trainval * 0.2 / 0.7)
            self.assertEqual(expected_train_count, stats["train"]["count"])
            self.assertEqual(expected_val_count, stats["val"]["count"])
            dist_train = stats["train"]["distribution"]
            dist_val = stats["val"]["distribution"]
            for pid in train_ids:
                # distribution keys are strings; attribute IDs were ints
                total = counts[int(pid)] if with_attr else counts[pid]
                self.assertEqual(int(total * 0.5 / 0.7), dist_train[pid][0])
                self.assertEqual(int(total * 0.2 / 0.7), dist_val[pid][0])

            # check test set statistics: each test ID is split between
            # gallery and query in a 0.3 : 0.4 ratio
            dist_gallery = stats["test-gallery"]["distribution"]
            dist_query = stats["test-query"]["distribution"]
            for pid in test_ids:
                total = counts[int(pid)] if with_attr else counts[pid]
                self.assertEqual(int(total * 0.3 / 0.7), dist_gallery[pid][0])
                self.assertEqual(int(total * 0.4 / 0.7), dist_query[pid][0])
Example #8
0
    def test_split_for_classification_multi_label_with_attr(self):
        """Two labels sharing "attr1" (plus one private attribute each);
        check subset sizes, label and attribute statistics, and that the
        split is reproducible for a fixed seed."""
        counts = {
            (0, 0): 20, (0, 1): 20, (0, 2): 30,
            (1, 0): 20, (1, 1): 10, (1, 2): 20,
        }
        config = {
            "label1": {"attrs": ["attr1", "attr2"], "counts": counts},
            "label2": {"attrs": ["attr1", "attr3"], "counts": counts},
        }
        source = self._generate_dataset(config)

        splits = [("train", 0.7), ("test", 0.3)]
        actual = splitter.ClassificationSplit(source, splits)

        subsets = {name: actual.get_subset(name) for name in ("train", "test")}
        self.assertEqual(168, len(subsets["train"]))
        self.assertEqual(72, len(subsets["test"]))

        # Expected per-subset statistics. "attr1" is carried by both
        # labels, so its counts are doubled relative to attr2/attr3.
        expected = {
            "train": {
                "labels": {"label1": 84, "label2": 84},
                "attrs": {
                    "attr1": {"0": 98, "1": 70},
                    "attr2": {"0": 28, "1": 21, "2": 35},
                    "attr3": {"0": 28, "1": 21, "2": 35},
                },
            },
            "test": {
                "labels": {"label1": 36, "label2": 36},
                "attrs": {
                    "attr1": {"0": 42, "1": 30},
                    "attr2": {"0": 12, "1": 9, "2": 15},
                    "attr3": {"0": 12, "1": 9, "2": 15},
                },
            },
        }
        for sname, exp in expected.items():
            stats = compute_ann_statistics(subsets[sname])
            label_dist = stats["annotations"]["labels"]["distribution"]
            for label, count in exp["labels"].items():
                self.assertEqual(count, label_dist[label][0])
            attr_stats = stats["annotations"]["labels"]["attributes"]
            for attr, value_counts in exp["attrs"].items():
                for value, count in value_counts.items():
                    self.assertEqual(
                        count, attr_stats[attr]["distribution"][value][0])

        with self.subTest("random seed test"):
            r1 = splitter.ClassificationSplit(source, splits, seed=1234)
            r2 = splitter.ClassificationSplit(source, splits, seed=1234)
            r3 = splitter.ClassificationSplit(source, splits, seed=4321)
            self.assertEqual(list(r1.get_subset("test")),
                             list(r2.get_subset("test")))
            self.assertNotEqual(list(r1.get_subset("test")),
                                list(r3.get_subset("test")))
Example #9
0
    def test_stats(self):
        """End-to-end check of compute_ann_statistics on a small dataset
        mixing captions, labels, bboxes, points, a mask, attributes, and
        one unannotated item."""
        dataset = Dataset.from_iterable(
            [
                DatasetItem(
                    id=1,
                    image=np.ones((5, 5, 3)),
                    annotations=[
                        Caption('hello'),
                        Caption('world'),
                        Label(2, attributes={
                            'x': 1,
                            'y': '2',
                        }),
                        Bbox(1, 2, 2, 2, label=2, attributes={
                            'score': 0.5,
                        }),
                        # unlabeled bbox: excluded from label-based stats
                        Bbox(5,
                             6,
                             2,
                             2,
                             attributes={
                                 'x': 1,
                                 'y': '3',
                                 'occluded': True,
                             }),
                        Points([1, 2, 2, 0, 1, 1], label=0),
                        Mask(label=3,
                             image=np.array([
                                 [0, 0, 1, 1, 1],
                                 [0, 0, 1, 1, 1],
                                 [0, 0, 1, 1, 1],
                                 [0, 0, 0, 0, 0],
                                 [0, 0, 0, 0, 0],
                             ])),
                    ]),
                DatasetItem(
                    id=2,
                    image=np.ones((2, 4, 3)),
                    annotations=[
                        Label(2, attributes={
                            'x': 2,
                            'y': '2',
                        }),
                        Bbox(1, 2, 2, 2, label=3, attributes={
                            'score': 0.5,
                        }),
                        Bbox(5,
                             6,
                             2,
                             2,
                             attributes={
                                 'x': 2,
                                 'y': '3',
                                 'occluded': False,
                             }),
                    ]),
                # item with no annotations at all
                DatasetItem(id=3),
            ],
            categories=['label_%s' % i for i in range(4)])

        expected = {
            'images count': 3,
            'annotations count': 10,
            'unannotated images count': 1,
            'unannotated images': ['3'],
            'annotations by type': {
                'label': {
                    'count': 2,
                },
                'polygon': {
                    'count': 0,
                },
                'polyline': {
                    'count': 0,
                },
                'bbox': {
                    'count': 4,
                },
                'mask': {
                    'count': 1,
                },
                'points': {
                    'count': 1,
                },
                'caption': {
                    'count': 2,
                },
            },
            'annotations': {
                'labels': {
                    # 2 labels + 2 labeled bboxes + 1 points + 1 mask
                    'count': 6,
                    'distribution': {
                        'label_0': [1, 1 / 6],
                        'label_1': [0, 0.0],
                        'label_2': [3, 3 / 6],
                        'label_3': [2, 2 / 6],
                    },
                    'attributes': {
                        'x': {
                            'count':
                            2,  # annotations with no label are skipped
                            'values count': 2,
                            'values present': ['1', '2'],
                            'distribution': {
                                '1': [1, 1 / 2],
                                '2': [1, 1 / 2],
                            },
                        },
                        'y': {
                            'count':
                            2,  # annotations with no label are skipped
                            'values count': 1,
                            'values present': ['2'],
                            'distribution': {
                                '2': [2, 2 / 2],
                            },
                        },
                        # must not include "special" attributes like "occluded"
                    }
                },
                'segments': {
                    # two 2x2 boxes (area 4) and one mask with 9 set pixels
                    'avg. area': (4 * 2 + 9 * 1) / 3,
                    # 10 equal-width bins over the [4.0, 9.0] area range
                    'area distribution': [
                        {
                            'min': 4.0,
                            'max': 4.5,
                            'count': 2,
                            'percent': 2 / 3
                        },
                        {
                            'min': 4.5,
                            'max': 5.0,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 5.0,
                            'max': 5.5,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 5.5,
                            'max': 6.0,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 6.0,
                            'max': 6.5,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 6.5,
                            'max': 7.0,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 7.0,
                            'max': 7.5,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 7.5,
                            'max': 8.0,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 8.0,
                            'max': 8.5,
                            'count': 0,
                            'percent': 0.0
                        },
                        {
                            'min': 8.5,
                            'max': 9.0,
                            'count': 1,
                            'percent': 1 / 3
                        },
                    ],
                    'pixel distribution': {
                        'label_0': [0, 0.0],
                        'label_1': [0, 0.0],
                        'label_2': [4, 4 / 17],
                        'label_3': [13, 13 / 17],
                    },
                }
            },
        }

        actual = compute_ann_statistics(dataset)

        self.assertEqual(expected, actual)