def test_split_for_classification_single_class_single_attr(self):
    """Stratified split of one label with a single attribute keeps the
    70/30 ratio both overall and per attribute value."""
    counts = {0: 10, 1: 20, 2: 30}
    config = {"label": {"attrs": ["attr"], "counts": counts}}
    source = self._generate_dataset(config)
    task = splitter.SplitTask.classification.name
    splits = [("train", 0.7), ("test", 0.3)]

    actual = splitter.Split(source, task, splits, seed=100)

    self.assertEqual(42, len(actual.get_subset("train")))
    self.assertEqual(18, len(actual.get_subset("test")))

    # Expected per-subset counts for each attribute value.
    expected = {
        "train": {"0": 7, "1": 14, "2": 21},
        "test": {"0": 3, "1": 6, "2": 9},
    }
    for subset_name, value_counts in expected.items():
        stat = compute_ann_statistics(actual.get_subset(subset_name))
        attr_dist = \
            stat["annotations"]["labels"]["attributes"]["attr"]["distribution"]
        for value, count in value_counts.items():
            self.assertEqual(count, attr_dist[value][0])
def test_split_for_classification_single_class_multi_attr(self):
    """Stratified split of one label carrying two attributes, with and
    without a remainder in the requested ratios."""
    counts = {
        (0, 0): 20, (0, 1): 20, (0, 2): 30,
        (1, 0): 20, (1, 1): 10, (1, 2): 20,
    }
    attrs = ["attr1", "attr2"]
    config = {"label": {"attrs": attrs, "counts": counts}}
    source = self._generate_dataset(config)
    task = splitter.SplitTask.classification.name

    with self.subTest("zero remainder"):
        splits = [("train", 0.7), ("test", 0.3)]
        actual = splitter.Split(source, task, splits, seed=100)

        self.assertEqual(84, len(actual.get_subset("train")))
        self.assertEqual(36, len(actual.get_subset("test")))

        # Expected per-subset counts for every (attribute, value) pair.
        expected = {
            "train": {
                "attr1": {"0": 49, "1": 35},
                "attr2": {"0": 28, "1": 21, "2": 35},
            },
            "test": {
                "attr1": {"0": 21, "1": 15},
                "attr2": {"0": 12, "1": 9, "2": 15},
            },
        }
        for subset_name, per_attr in expected.items():
            stat = compute_ann_statistics(actual.get_subset(subset_name))
            attr_stats = stat["annotations"]["labels"]["attributes"]
            for attr_name, value_counts in per_attr.items():
                dist = attr_stats[attr_name]["distribution"]
                for value, count in value_counts.items():
                    self.assertEqual(count, dist[value][0])

    with self.subTest("non-zero remainder"):
        splits = [("train", 0.95), ("test", 0.05)]
        actual = splitter.Split(source, task, splits, seed=100)

        self.assertEqual(114, len(actual.get_subset("train")))
        self.assertEqual(6, len(actual.get_subset("test")))
def stats_command(args):
    """Compute image and annotation statistics for the project dataset
    and write them to a freshly named 'statistics*.json' file.

    Args:
        args: parsed CLI arguments; only ``args.project_dir`` is read here.
    """
    project = load_project(args.project_dir)
    dataset = project.make_dataset()

    stats = {}
    stats.update(compute_image_statistics(dataset))
    stats.update(compute_ann_statistics(dataset))

    dst_file = generate_next_file_name('statistics', ext='.json')
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    log.info("Writing project statistics to '%s'", dst_file)
    with open(dst_file, 'w') as f:
        json.dump(stats, f, indent=4, sort_keys=True)
def test_split_for_classification_multi_class_no_attr(self):
    """Stratified split across several labels without attributes keeps
    the 70/30 proportion per label."""
    config = {
        "label1": {"attrs": None, "counts": 10},
        "label2": {"attrs": None, "counts": 20},
        "label3": {"attrs": None, "counts": 30},
    }
    source = self._generate_dataset(config)
    task = splitter.SplitTask.classification.name
    splits = [("train", 0.7), ("test", 0.3)]

    actual = splitter.Split(source, task, splits, seed=100)

    self.assertEqual(42, len(actual.get_subset("train")))
    self.assertEqual(18, len(actual.get_subset("test")))

    # Expected per-subset counts for each label.
    expected = {
        "train": {"label1": 7, "label2": 14, "label3": 21},
        "test": {"label1": 3, "label2": 6, "label3": 9},
    }
    for subset_name, label_counts in expected.items():
        stat = compute_ann_statistics(actual.get_subset(subset_name))
        dist = stat["annotations"]["labels"]["distribution"]
        for label, count in label_counts.items():
            self.assertEqual(count, dist[label][0])
def test_stats_with_empty_dataset(self):
    """A dataset containing only unannotated items must produce all-zero
    statistics and list every item as unannotated."""
    labels = ['label_%s' % i for i in range(4)]
    dataset = Dataset.from_iterable([
        DatasetItem(id=1),
        DatasetItem(id=3),
    ], categories=labels)

    ann_types = ['label', 'polygon', 'polyline', 'bbox', 'mask',
        'points', 'caption', 'cuboid_3d']
    expected = {
        'images count': 2,
        'annotations count': 0,
        'unannotated images count': 2,
        'unannotated images': ['1', '3'],
        'annotations by type': {t: {'count': 0} for t in ann_types},
        'annotations': {
            'labels': {
                'count': 0,
                'distribution': {label: [0, 0.0] for label in labels},
                'attributes': {},
            },
            'segments': {
                'avg. area': 0.0,
                'area distribution': [],
                'pixel distribution': {label: [0, 0.0] for label in labels},
            },
        },
    }

    actual = compute_ann_statistics(dataset)

    self.assertEqual(expected, actual)
def stats_command(args):
    """Compute the requested statistics for the target dataset and write
    them to a freshly named 'statistics*.json' file.

    Resolves ``args.target`` as a revpath (optionally inside the current
    project), optionally narrows to ``args.subset``, and includes image
    and/or annotation stats per the ``args.image_stats`` /
    ``args.ann_stats`` flags.
    """
    project = None
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is only an error when one was explicitly requested.
        if args.project_dir:
            raise

    dataset, target_project = parse_full_revpath(args.target, project)
    if target_project:
        scope_add(target_project)

    if args.subset:
        dataset = dataset.get_subset(args.subset)

    stats = {}
    if args.image_stats:
        stats.update(compute_image_statistics(dataset))
    if args.ann_stats:
        stats.update(compute_ann_statistics(dataset))

    dst_file = generate_next_file_name('statistics', ext='.json')
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    log.info("Writing project statistics to '%s'", dst_file)
    dump_json_file(dst_file, stats, indent=True)
def test_split_for_reidentification(self):
    """
    Test ReidentificationSplit using Dataset with label (ImageNet style)
    """
    def _present_ids(stat):
        # IDs (labels / attribute values) with a non-zero count.
        return {label for label, dist in stat["distribution"].items()
            if dist[0] > 0}

    for with_attr in [True, False]:
        if with_attr:
            counts = {i: (i % 3 + 1) * 7 for i in range(10)}
            config = {"person": {"attrs": ["PID"], "counts": counts}}
            attr_for_id = "PID"
        else:
            counts = {}
            config = dict()
            for i in range(10):
                label = "label%d" % i
                count = (i % 3 + 1) * 7
                counts[label] = count
                config[label] = {"attrs": None, "counts": count}
            attr_for_id = None
        source = self._generate_dataset(config)
        splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)]
        query = 0.4 / 0.7

        actual = splitter.ReidentificationSplit(source, splits, query,
            attr_for_id)

        stats = dict()
        for sname in ["train", "val", "test-query", "test-gallery"]:
            subset = actual.get_subset(sname)
            stat = compute_ann_statistics(subset)["annotations"]["labels"]
            if with_attr:
                stat = stat["attributes"]["PID"]
            stats[sname] = stat

        # check size of subsets
        self.assertEqual(65, stats["train"]["count"])
        self.assertEqual(26, stats["val"]["count"])
        self.assertEqual(18, stats["test-gallery"]["count"])
        self.assertEqual(24, stats["test-query"]["count"])

        # check ID separation between test set and others
        train_ids = _present_ids(stats["train"])
        test_ids = _present_ids(stats["test-gallery"])
        for pid in train_ids:
            assert pid not in test_ids
        self.assertEqual(7, len(train_ids))
        self.assertEqual(3, len(test_ids))
        self.assertEqual(train_ids, _present_ids(stats["val"]))
        self.assertEqual(test_ids, _present_ids(stats["test-query"]))

        # check trainval set statistics
        trainval = stats["train"]["count"] + stats["val"]["count"]
        expected_train_count = int(trainval * 0.5 / 0.7)
        expected_val_count = int(trainval * 0.2 / 0.7)
        self.assertEqual(expected_train_count, stats["train"]["count"])
        self.assertEqual(expected_val_count, stats["val"]["count"])
        dist_train = stats["train"]["distribution"]
        dist_val = stats["val"]["distribution"]
        for pid in train_ids:
            total = counts[int(pid)] if with_attr else counts[pid]
            self.assertEqual(int(total * 0.5 / 0.7), dist_train[pid][0])
            self.assertEqual(int(total * 0.2 / 0.7), dist_val[pid][0])

        # check test set statistics
        dist_gallery = stats["test-gallery"]["distribution"]
        dist_query = stats["test-query"]["distribution"]
        for pid in test_ids:
            total = counts[int(pid)] if with_attr else counts[pid]
            self.assertEqual(int(total * 0.3 / 0.7), dist_gallery[pid][0])
            self.assertEqual(int(total * 0.4 / 0.7), dist_query[pid][0])
def test_split_for_classification_multi_label_with_attr(self):
    """Stratified split of two labels that share one attribute name;
    also checks that the split is deterministic for a given seed."""
    counts = {
        (0, 0): 20, (0, 1): 20, (0, 2): 30,
        (1, 0): 20, (1, 1): 10, (1, 2): 20,
    }
    config = {
        "label1": {"attrs": ["attr1", "attr2"], "counts": counts},
        "label2": {"attrs": ["attr1", "attr3"], "counts": counts},
    }
    source = self._generate_dataset(config)
    splits = [("train", 0.7), ("test", 0.3)]

    actual = splitter.ClassificationSplit(source, splits)

    train = actual.get_subset("train")
    test = actual.get_subset("test")
    self.assertEqual(168, len(train))
    self.assertEqual(72, len(test))

    # Expected per-subset label counts and (attribute, value) counts.
    # "attr1" belongs to both labels, so its counts are doubled.
    expected = {
        "train": {
            "labels": {"label1": 84, "label2": 84},
            "attrs": {
                "attr1": {"0": 49 * 2, "1": 35 * 2},
                "attr2": {"0": 28, "1": 21, "2": 35},
                "attr3": {"0": 28, "1": 21, "2": 35},
            },
        },
        "test": {
            "labels": {"label1": 36, "label2": 36},
            "attrs": {
                "attr1": {"0": 21 * 2, "1": 15 * 2},
                "attr2": {"0": 12, "1": 9, "2": 15},
                "attr3": {"0": 12, "1": 9, "2": 15},
            },
        },
    }
    for subset, subset_expected in ((train, expected["train"]),
            (test, expected["test"])):
        stat = compute_ann_statistics(subset)["annotations"]["labels"]
        for label, count in subset_expected["labels"].items():
            self.assertEqual(count, stat["distribution"][label][0])
        for attr_name, value_counts in subset_expected["attrs"].items():
            dist = stat["attributes"][attr_name]["distribution"]
            for value, count in value_counts.items():
                self.assertEqual(count, dist[value][0])

    with self.subTest("random seed test"):
        r1 = splitter.ClassificationSplit(source, splits, seed=1234)
        r2 = splitter.ClassificationSplit(source, splits, seed=1234)
        r3 = splitter.ClassificationSplit(source, splits, seed=4321)
        self.assertEqual(list(r1.get_subset("test")),
            list(r2.get_subset("test")))
        self.assertNotEqual(list(r1.get_subset("test")),
            list(r3.get_subset("test")))
def test_stats(self):
    """Check annotation statistics over a dataset mixing captions, labels,
    boxes, points, masks, and one unannotated item."""
    dataset = Dataset.from_iterable([
        DatasetItem(id=1, image=np.ones((5, 5, 3)), annotations=[
            Caption('hello'),
            Caption('world'),
            Label(2, attributes={'x': 1, 'y': '2'}),
            Bbox(1, 2, 2, 2, label=2, attributes={'score': 0.5}),
            Bbox(5, 6, 2, 2, attributes={
                'x': 1, 'y': '3', 'occluded': True,
            }),
            Points([1, 2, 2, 0, 1, 1], label=0),
            Mask(label=3, image=np.array([
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
            ])),
        ]),
        DatasetItem(id=2, image=np.ones((2, 4, 3)), annotations=[
            Label(2, attributes={'x': 2, 'y': '2'}),
            Bbox(1, 2, 2, 2, label=3, attributes={'score': 0.5}),
            Bbox(5, 6, 2, 2, attributes={
                'x': 2, 'y': '3', 'occluded': False,
            }),
        ]),
        DatasetItem(id=3),
    ], categories=['label_%s' % i for i in range(4)])

    # Ten equal-width area bins over [4.0, 9.0); only the first bin
    # (two 4-pixel segments) and the last (the 9-pixel mask) are filled.
    area_bins = [
        {'min': 4.0 + 0.5 * i, 'max': 4.0 + 0.5 * (i + 1),
            'count': 0, 'percent': 0.0}
        for i in range(10)
    ]
    area_bins[0].update(count=2, percent=2 / 3)
    area_bins[-1].update(count=1, percent=1 / 3)

    expected = {
        'images count': 3,
        'annotations count': 10,
        'unannotated images count': 1,
        'unannotated images': ['3'],
        'annotations by type': {
            'label': {'count': 2},
            'polygon': {'count': 0},
            'polyline': {'count': 0},
            'bbox': {'count': 4},
            'mask': {'count': 1},
            'points': {'count': 1},
            'caption': {'count': 2},
        },
        'annotations': {
            'labels': {
                'count': 6,
                'distribution': {
                    'label_0': [1, 1 / 6],
                    'label_1': [0, 0.0],
                    'label_2': [3, 3 / 6],
                    'label_3': [2, 2 / 6],
                },
                # Annotations with no label are skipped, and "special"
                # attributes like "occluded" must not be included.
                'attributes': {
                    'x': {
                        'count': 2,
                        'values count': 2,
                        'values present': ['1', '2'],
                        'distribution': {
                            '1': [1, 1 / 2],
                            '2': [1, 1 / 2],
                        },
                    },
                    'y': {
                        'count': 2,
                        'values count': 1,
                        'values present': ['2'],
                        'distribution': {
                            '2': [2, 2 / 2],
                        },
                    },
                },
            },
            'segments': {
                'avg. area': (4 * 2 + 9 * 1) / 3,
                'area distribution': area_bins,
                'pixel distribution': {
                    'label_0': [0, 0.0],
                    'label_1': [0, 0.0],
                    'label_2': [4, 4 / 17],
                    'label_3': [13, 13 / 17],
                },
            },
        },
    }

    actual = compute_ann_statistics(dataset)

    self.assertEqual(expected, actual)