def test_project_can_merge_item_annotations(self):
    """Merging two sources must union annotations of the shared item,
    collapsing the duplicate Label(3, {'x': 1}) into a single entry."""

    class SourceA(Extractor):
        def __iter__(self):
            yield DatasetItem(id=1, subset='train', annotations=[
                Label(2, id=3),
                Label(3, attributes={'x': 1}),
            ])

    class SourceB(Extractor):
        def __iter__(self):
            yield DatasetItem(id=1, subset='train', annotations=[
                Label(3, attributes={'x': 1}),
                Label(4, id=4),
            ])

    project = Project()
    project.env.extractors.register('t1', SourceA)
    project.env.extractors.register('t2', SourceB)
    project.add_source('source1', {'format': 't1'})
    project.add_source('source2', {'format': 't2'})

    merged = project.make_dataset()

    # one shared item; 2 + 2 annotations with one duplicate -> 3
    self.assertEqual(1, len(merged))
    merged_item = next(iter(merged))
    self.assertEqual(3, len(merged_item.annotations))
def create_command(args):
    """Create a new project at ``args.dst_dir``.

    Refuses to clobber a non-empty project env or dataset directory
    unless ``--overwrite`` was passed. Returns 0 on success.
    """
    project_dir = osp.abspath(args.dst_dir)

    env_dir = osp.join(project_dir, DEFAULT_CONFIG.env_dir)
    if osp.isdir(env_dir) and os.listdir(env_dir):
        if not args.overwrite:
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % env_dir)
        shutil.rmtree(env_dir, ignore_errors=True)

    dataset_dir = osp.join(project_dir, DEFAULT_CONFIG.dataset_dir)
    if osp.isdir(dataset_dir) and os.listdir(dataset_dir):
        if not args.overwrite:
            raise CliException("Directory '%s' already exists "
                "(pass --overwrite to overwrite)" % dataset_dir)
        # NOTE: remove the dir to avoid using data from previous project
        shutil.rmtree(dataset_dir)

    project_name = args.name
    if project_name is None:
        project_name = osp.basename(project_dir)

    log.info("Creating project at '%s'" % project_dir)
    Project.generate(project_dir, {
        'project_name': project_name,
    })
    log.info("Project has been created at '%s'" % project_dir)

    return 0
def test_custom_extractor_can_be_created(self):
    """A registered custom extractor must be usable as a project source."""

    class CustomExtractor(Extractor):
        def __iter__(self):
            items = [
                DatasetItem(id=0, subset='train'),
                DatasetItem(id=1, subset='train'),
                DatasetItem(id=2, subset='train'),

                DatasetItem(id=3, subset='test'),
                DatasetItem(id=4, subset='test'),

                DatasetItem(id=1),
                DatasetItem(id=2),
                DatasetItem(id=3),
            ]
            yield from items

    extractor_name = 'ext1'
    project = Project()
    project.env.extractors.register(extractor_name, CustomExtractor)
    project.add_source('src1', {
        'url': 'path',
        'format': extractor_name,
    })

    built = project.make_dataset()

    compare_datasets(self, CustomExtractor(), built)
def __call__(self, path, **extra_params):
    """Build a Project with one 'yolo' source per discovered ``*.data`` config.

    ``path`` may be a single ``.data`` file or a directory to scan.
    """
    from datumaro.components.project import Project # cyclic import
    project = Project()

    if path.endswith('.data') and osp.isfile(path):
        config_paths = [path]
    else:
        config_paths = glob(osp.join(path, '*.data'))
    if not osp.exists(path) or not config_paths:
        raise Exception("Failed to find 'yolo' dataset at '%s'" % path)

    for config_path in config_paths:
        log.info("Found a dataset at '%s'" % config_path)

        # disambiguate sources by "<parent dir>_<config stem>"
        source_name = '%s_%s' % (
            osp.basename(osp.dirname(config_path)),
            osp.splitext(osp.basename(config_path))[0],
        )
        project.add_source(source_name, {
            'url': config_path,
            'format': 'yolo',
            'options': dict(extra_params),
        })

    return project
def test_can_batch_launch_custom_model(self):
    """Inference must be fed in batches: each annotation carries its
    in-batch index ('idx') and the raw input value ('data')."""

    class TestExtractor(Extractor):
        def __iter__(self):
            yield from (
                DatasetItem(id=i, subset='train', image=np.array([i]))
                for i in range(5)
            )

    class TestLauncher(Launcher):
        def launch(self, inputs):
            for pos, inp in enumerate(inputs):
                yield [Label(attributes={'idx': pos, 'data': inp.item()})]

    model_name = 'model'
    launcher_name = 'custom_launcher'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.add_model(model_name, {'launcher': launcher_name})
    model = project.make_executable_model(model_name)
    extractor = TestExtractor()

    batch_size = 3
    executor = InferenceWrapper(extractor, model, batch_size=batch_size)

    for item in executor:
        self.assertEqual(1, len(item.annotations))
        ann = item.annotations[0]
        self.assertEqual(int(item.id) % batch_size, ann.attributes['idx'])
        self.assertEqual(int(item.id), ann.attributes['data'])
def __call__(self, path, **extra_params):
    """Build a Project with one source per subset file found under ``path``."""
    from datumaro.components.project import Project # cyclic import
    project = Project()

    subset_paths = self.find_subsets(path)
    if len(subset_paths) == 0:
        raise Exception("Failed to find 'cvat' dataset at '%s'" % path)

    for subset_path in subset_paths:
        # find_subsets may yield non-file entries; skip them
        if not osp.isfile(subset_path):
            continue

        log.info("Found a dataset at '%s'" % subset_path)

        subset_name = osp.splitext(osp.basename(subset_path))[0]
        project.add_source(subset_name, {
            'url': subset_path,
            'format': self.EXTRACTOR_NAME,
            'options': dict(extra_params),
        })

    return project
def test_can_batch_launch_custom_model(self):
    """ModelTransform must process items batch-wise: 'idx' is the item's
    position within its batch, 'data' echoes the input value."""
    dataset = Dataset.from_iterable(
        (DatasetItem(id=i, subset='train', image=np.array([i]))
            for i in range(5)),
        categories=['label'])

    class TestLauncher(Launcher):
        def launch(self, inputs):
            for pos, inp in enumerate(inputs):
                yield [Label(0, attributes={'idx': pos, 'data': inp.item()})]

    model_name = 'model'
    launcher_name = 'custom_launcher'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.add_model(model_name, {'launcher': launcher_name})
    model = project.make_executable_model(model_name)

    batch_size = 3
    executor = ModelTransform(dataset, model, batch_size=batch_size)

    for item in executor:
        self.assertEqual(1, len(item.annotations))
        ann = item.annotations[0]
        self.assertEqual(int(item.id) % batch_size, ann.attributes['idx'])
        self.assertEqual(int(item.id), ann.attributes['data'])
def __call__(self, path, **extra_params):
    """Build a Project with one source per JSON annotation file.

    ``path`` may be a single ``.json`` file, an annotations directory,
    or a dataset root containing the annotations directory.
    """
    from datumaro.components.project import Project # cyclic import
    project = Project()

    if path.endswith('.json') and osp.isfile(path):
        subset_paths = [path]
    else:
        subset_paths = glob(osp.join(path, '*.json'))

        # also look inside the conventional annotations subdirectory,
        # unless we are already in it
        if osp.basename(osp.normpath(path)) != DatumaroPath.ANNOTATIONS_DIR:
            path = osp.join(path, DatumaroPath.ANNOTATIONS_DIR)
            subset_paths += glob(osp.join(path, '*.json'))

    if len(subset_paths) == 0:
        raise Exception("Failed to find 'datumaro' dataset at '%s'" % path)

    for subset_path in subset_paths:
        if not osp.isfile(subset_path):
            continue

        log.info("Found a dataset at '%s'" % subset_path)

        subset_name = osp.splitext(osp.basename(subset_path))[0]
        project.add_source(subset_name, {
            'url': subset_path,
            'format': self.EXTRACTOR_NAME,
            'options': dict(extra_params),
        })

    return project
def test_source_datasets_can_be_merged(self):
    """Two sources yielding disjoint item id ranges must merge into a
    dataset whose size is the sum of both."""

    class TestExtractor(Extractor):
        def __init__(self, url, n=0, s=0):
            super().__init__(length=n)
            self.n = n  # item count
            self.s = s  # id offset

        def __iter__(self):
            yield from (
                DatasetItem(id=self.s + i, subset='train')
                for i in range(self.n)
            )

    e_name1 = 'e1'
    e_name2 = 'e2'
    n1 = 2
    n2 = 4

    project = Project()
    project.env.extractors.register(e_name1,
        lambda p: TestExtractor(p, n=n1))
    project.env.extractors.register(e_name2,
        lambda p: TestExtractor(p, n=n2, s=n1))
    project.add_source('source1', {'format': e_name1})
    project.add_source('source2', {'format': e_name2})

    merged = project.make_dataset()

    self.assertEqual(n1 + n2, len(merged))
def is_project_path(value):
    """Return True if *value* points to a loadable Datumaro project."""
    if not value:
        return False
    try:
        Project.load(value)
        return True
    except Exception:
        # any load failure means "not a project path"
        return False
def test_project_own_dataset_can_be_modified(self):
    """An item put into the project's own dataset must be readable back."""
    dataset = Project().make_dataset()

    new_item = DatasetItem(id=1)
    dataset.put(new_item)

    self.assertEqual(new_item, next(iter(dataset)))
def test_source_false_when_source_doesnt_exist(self):
    """SourceTarget.test must reject a name that was never registered."""
    registered_name = 'qwerty'
    project = Project()
    project.add_source(registered_name)
    target = SourceTarget(project=project)

    # query a similar but unregistered name
    self.assertFalse(target.test(registered_name + '123'))
def test_source_true_when_source_exists(self):
    """SourceTarget.test must accept a registered source name."""
    registered_name = 'qwerty'
    project = Project()
    project.add_source(registered_name)
    target = SourceTarget(project=project)

    self.assertTrue(target.test(registered_name))
def test_add_source(self):
    """A source added with a config must be retrievable and equal to it."""
    source_name = 'source'
    origin = Source({'url': 'path', 'format': 'ext'})
    project = Project()

    project.add_source(source_name, origin)

    stored = project.get_source(source_name)
    self.assertIsNotNone(stored)
    self.assertEqual(stored, origin)
def test_added_source_can_be_saved(self):
    """An added source must appear in the serialized project config."""
    source_name = 'source'
    origin = Source({
        'url': 'path',
    })
    project = Project()
    project.add_source(source_name, origin)

    config = project.config

    self.assertEqual(origin, config.sources[source_name])
def test_can_save_and_load_own_dataset(self):
    """A project's own dataset must round-trip through save/load."""
    with TestDir() as test_dir:
        src_dataset = Project().make_dataset()
        src_dataset.put(DatasetItem(id=1))
        src_dataset.save(test_dir)

        loaded_dataset = Project.load(test_dir).make_dataset()

        self.assertEqual(list(src_dataset), list(loaded_dataset))
def __call__(self, path, **extra_params):
    """Build a Project with one source per subset descriptor found at ``path``.

    Each descriptor is a ready-made source config; the source name is the
    descriptor's file stem.
    """
    from datumaro.components.project import Project # cyclic import
    project = Project()

    descriptors = self.find_subsets(path)
    if len(descriptors) == 0:
        raise Exception("Failed to find dataset at '%s'" % path)

    for desc in descriptors:
        name = osp.splitext(osp.basename(desc['url']))[0]
        project.add_source(name, desc)

    return project
def test_can_do_transform_with_custom_model(self):
    """End-to-end: run a model over a source, save results with a custom
    converter, then load them back and check the predicted labels."""

    class TestExtractorSrc(Extractor):
        def __iter__(self):
            yield from (
                DatasetItem(id=i, image=np.ones([2, 2, 3]) * i,
                    annotations=[Label(i)])
                for i in range(2)
            )

    class TestLauncher(Launcher):
        def launch(self, inputs):
            # the "model" just echoes the first pixel value as a label
            for inp in inputs:
                yield [Label(inp[0, 0, 0])]

    class TestConverter(Converter):
        def __call__(self, extractor, save_dir):
            for item in extractor:
                with open(osp.join(save_dir, '%s.txt' % item.id), 'w') as f:
                    f.write(str(item.annotations[0].label) + '\n')

    class TestExtractorDst(Extractor):
        def __init__(self, url):
            super().__init__()
            self.items = [osp.join(url, p) for p in sorted(os.listdir(url))]

        def __iter__(self):
            for path in self.items:
                with open(path, 'r') as f:
                    index = osp.splitext(osp.basename(path))[0]
                    label = int(f.readline().strip())
                    yield DatasetItem(id=index, annotations=[Label(label)])

    model_name = 'model'
    launcher_name = 'custom_launcher'
    extractor_name = 'custom_extractor'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.env.extractors.register(extractor_name, TestExtractorSrc)
    project.env.converters.register(extractor_name, TestConverter)
    project.add_model(model_name, {'launcher': launcher_name})
    project.add_source('source', {'format': extractor_name})

    with TestDir() as test_dir:
        project.make_dataset().apply_model(model=model_name,
            save_dir=test_dir)

        result = Project.load(test_dir)
        result.env.extractors.register(extractor_name, TestExtractorDst)
        items = iter(result.make_dataset())

        first = next(items)
        second = next(items)
        self.assertEqual(0, first.annotations[0].label)
        self.assertEqual(1, second.annotations[0].label)
def test_project_filter_can_be_applied(self):
    """An XPath filter over item ids must keep only the matching items."""

    class TestExtractor(Extractor):
        def __iter__(self):
            yield from (
                DatasetItem(id=i, subset='train') for i in range(10)
            )

    e_type = 'type'
    project = Project()
    project.env.extractors.register(e_type, TestExtractor)
    project.add_source('source', {'format': e_type})

    filtered = project.make_dataset().extract('/item[id < 5]')

    # ids 0..4 pass the filter
    self.assertEqual(5, len(filtered))
def __call__(self, path):
    """Build a Project with a single source located at ``path``."""
    from datumaro.components.project import Project # cyclic import
    project = Project()

    if not osp.exists(path):
        raise Exception("Failed to find 'datumaro' dataset at '%s'" % path)

    name = osp.splitext(osp.basename(path))[0]
    project.add_source(name, {
        'url': path,
        'format': self.EXTRACTOR_NAME,
    })

    return project
def __call__(self, path, **extra_params):
    """Build a Project with a single source for the dataset directory ``path``."""
    from datumaro.components.project import Project # cyclic import
    project = Project()

    if not osp.isdir(path):
        raise Exception("Can't find a directory at '%s'" % path)

    name = osp.basename(osp.normpath(path))
    # NOTE(review): 'url' is set to the directory basename, not the full
    # path — presumably resolved relative to the project dir; verify.
    project.add_source(name, {
        'url': name,
        'format': self.EXTRACTOR_NAME,
        'options': dict(extra_params),
    })

    return project
def test_transform_fails_on_inplace_update_of_stage(self):
    """Transforming a committed revision target in place must be refused,
    with or without --overwrite."""
    with TestDir() as test_dir:
        dataset_url = osp.join(test_dir, 'dataset')
        Dataset.from_iterable([
            DatasetItem(id=1, annotations=[Bbox(1, 2, 3, 4, label=1)]),
        ], categories=['a', 'b']).export(dataset_url, 'coco',
            save_images=True)

        project_dir = osp.join(test_dir, 'proj')
        with Project.init(project_dir) as project:
            project.import_source('source-1', dataset_url, 'coco',
                no_cache=True)
            project.commit('first commit')

        with self.subTest('without overwrite'):
            run(self, 'transform', '-p', project_dir,
                '-t', 'random_split', 'HEAD:source-1',
                expected_code=1)

        with self.subTest('with overwrite'):
            # a revision target stays read-only even when forced
            with self.assertRaises(ReadonlyDatasetError):
                run(self, 'transform', '-p', project_dir, '--overwrite',
                    '-t', 'random_split', 'HEAD:source-1')
def test_can_release_resources_on_checkout(self, fxt_sample_video):
    """Checking out a past revision must release the video source and
    restore the previous dataset contents."""
    test_dir = scope_add(TestDir())
    project = scope_add(Project.init(test_dir))

    # commit 1: a plain single-item datumaro source
    src_url = osp.join(test_dir, 'src')
    Dataset.from_iterable([
        DatasetItem(1),
    ], categories=['a']).save(src_url)
    project.add_source(src_url, 'datumaro')
    project.commit('commit 1')

    # commit 2: replace it with video frames
    project.remove_source('src', keep_data=False)
    project.import_source('src', osp.dirname(fxt_sample_video),
        'video_frames', rpath=osp.basename(fxt_sample_video))
    project.commit('commit 2')

    assert len(project.working_tree.make_dataset()) == 4
    assert osp.isdir(osp.join(test_dir, 'src'))

    project.checkout('HEAD~1')

    assert len(project.working_tree.make_dataset()) == 1
def main(args=None):
    """CLI entry point: parse args, resolve the target, dispatch the command.

    Returns the command's exit code, or 1 on a target parsing error.
    """
    parser = build_parser()
    args = parser.parse_args(args)

    project_path = args.project_dir
    project = Project.load(project_path) if is_project_path(project_path) \
        else None

    try:
        args.target = target_selector(
            ProjectTarget(is_default=True, project=project),
            SourceTarget(project=project),
            ExternalDatasetTarget(),
            ImageTarget()
        )(args.target)
        # when the target itself is a project path, retarget project_dir
        if args.target[0] == TargetKinds.project:
            if is_project_path(args.target[1]):
                args.project_dir = osp.dirname(osp.abspath(args.target[1]))
    except argparse.ArgumentTypeError as e:
        print(e)
        parser.print_help()
        return 1

    return process_command(args.target, args.params, args)
def test_can_import(self):
    """Importing the dummy TF Detection API dataset must yield the
    expected items, boxes, and source_id attributes."""
    expected = Dataset.from_iterable([
        DatasetItem(id=1, subset='train',
            image=np.ones((16, 16, 3)),
            annotations=[
                Bbox(0, 4, 4, 8, label=2),
                Bbox(0, 4, 4, 4, label=3),
                Bbox(2, 4, 4, 4),
            ],
            attributes={'source_id': '1'}),

        DatasetItem(id=2, subset='val',
            image=np.ones((8, 8, 3)),
            annotations=[
                Bbox(1, 2, 4, 2, label=3),
            ],
            attributes={'source_id': '2'}),

        DatasetItem(id=3, subset='test',
            image=np.ones((5, 4, 3)) * 3,
            attributes={'source_id': '3'}),
    ], categories={
        AnnotationType.label: LabelCategories.from_iterable(
            'label_' + str(label) for label in range(10)),
    })

    actual = Project.import_from(DUMMY_DATASET_DIR, 'tf_detection_api') \
        .make_dataset()

    compare_datasets(self, expected, actual)
def test_can_import(self):
    """Importing the dummy YOLO dataset must yield the expected boxes
    and a 10-label category list."""

    class DstExtractor(Extractor):
        def __iter__(self):
            yield DatasetItem(id=1, subset='train',
                image=np.ones((10, 15, 3)),
                annotations=[
                    Bbox(0, 2, 4, 2, label=2),
                    Bbox(3, 3, 2, 3, label=4),
                ])

        def categories(self):
            labels = LabelCategories()
            for i in range(10):
                labels.add('label_' + str(i))
            return {
                AnnotationType.label: labels,
            }

    imported = Project.import_from(DUMMY_DATASET_DIR, 'yolo') \
        .make_dataset()

    compare_datasets(self, DstExtractor(), imported)
def test_can_import(self):
    """Importing the dummy MOT sequence must yield one item with a single
    box carrying occlusion/visibility/ignored attributes."""
    expected = Dataset.from_iterable([
        DatasetItem(id=1,
            image=np.ones((16, 16, 3)),
            annotations=[
                Bbox(0, 4, 4, 8, label=2, attributes={
                    'occluded': False,
                    'visibility': 1.0,
                    'ignored': False,
                }),
            ]),
    ], categories={
        AnnotationType.label: LabelCategories.from_iterable(
            'label_' + str(label) for label in range(10)),
    })

    actual = Project.import_from(DUMMY_DATASET_DIR, 'mot_seq') \
        .make_dataset()

    compare_datasets(self, expected, actual)
def test_can_import(self):
    """Importing a generated COCO dataset must yield the expected polygon
    and crowd-mask annotations."""

    class DstExtractor(Extractor):
        def __iter__(self):
            yield DatasetItem(id=1, image=np.ones((10, 5, 3)), subset='val',
                annotations=[
                    Polygon([0, 0, 1, 0, 1, 2, 0, 2], label=0,
                        id=1, group=1, attributes={'is_crowd': False}),
                    Mask(np.array(
                        [[1, 0, 0, 1, 0]] * 5 + [[1, 1, 1, 1, 0]] * 5),
                        label=0, id=2, group=2,
                        attributes={'is_crowd': True}),
                ])

        def categories(self):
            labels = LabelCategories()
            labels.add('TEST')
            return {AnnotationType.label: labels}

    with TestDir() as test_dir:
        self.COCO_dataset_generate(test_dir)

        imported = Project.import_from(test_dir, 'coco').make_dataset()

        compare_datasets(self, DstExtractor(), imported)
def test_can_save_and_load(self):
    """The datumaro converter must round-trip subsets, items, and
    categories without loss."""
    with TestDir() as test_dir:
        source_dataset = self.TestExtractor()

        converter = DatumaroConverter(save_images=True, apply_colormap=True)
        converter(source_dataset, test_dir.path)

        parsed_dataset = Project.import_from(test_dir.path, 'datumaro') \
            .make_dataset()

        self.assertListEqual(
            sorted(source_dataset.subsets()),
            sorted(parsed_dataset.subsets()),
        )
        self.assertEqual(len(source_dataset), len(parsed_dataset))

        # compare subsets item by item, padding with None on length mismatch
        for subset_name in source_dataset.subsets():
            source_subset = source_dataset.get_subset(subset_name)
            parsed_subset = parsed_dataset.get_subset(subset_name)
            for idx, (item_a, item_b) in enumerate(
                    zip_longest(source_subset, parsed_subset)):
                self.assertEqual(item_a, item_b, str(idx))

        self.assertEqual(source_dataset.categories(),
            parsed_dataset.categories())
def test_ambiguous_format(self):
    """A revpath that matches several formats must fail with the expected
    problem set, both with and without a project context."""
    test_dir = scope_add(TestDir())
    dataset_url = osp.join(test_dir, 'source')

    # create an ambiguous dataset by merging annotations from
    # datasets in different formats
    annotation_dir = osp.join(dataset_url, 'training/street')
    assets_dir = osp.join(osp.dirname(__file__), '../assets')
    os.makedirs(annotation_dir)
    for asset in [
        'ade20k2017_dataset/dataset/training/street/1_atr.txt',
        'ade20k2020_dataset/dataset/training/street/1.json',
    ]:
        shutil.copy(osp.join(assets_dir, asset), annotation_dir)

    with self.subTest("no context"):
        with self.assertRaises(WrongRevpathError) as cm:
            parse_full_revpath(dataset_url)
        problems = {type(e) for e in cm.exception.problems}
        self.assertEqual(
            {ProjectNotFoundError, MultipleFormatsMatchError},
            problems)

    proj_dir = osp.join(test_dir, 'proj')
    proj = scope_add(Project.init(proj_dir))

    with self.subTest("in context"):
        with self.assertRaises(WrongRevpathError) as cm:
            parse_full_revpath(dataset_url, proj)
        problems = {type(e) for e in cm.exception.problems}
        self.assertEqual(
            {UnknownTargetError, MultipleFormatsMatchError},
            problems)