def test_add_source(self):
    """A source registered via add_source() must be retrievable unchanged."""
    name = 'source'
    source = Source({'url': 'path', 'format': 'ext'})

    project = Project()
    project.add_source(name, source)
    retrieved = project.get_source(name)

    self.assertIsNotNone(retrieved)
    self.assertEqual(retrieved, source)
def test_can_have_project_source(self):
    """A generated project can be attached to another project as a source."""
    with TestDir() as test_dir:
        Project.generate(test_dir)

        parent = Project()
        parent.add_source('project1', {
            'url': test_dir,
        })

        dataset = parent.make_dataset()

        self.assertIn('project1', dataset.sources)
def test_added_source_can_be_saved(self):
    """add_source() must be reflected in the project's config."""
    name = 'source'
    origin = Source({'url': 'path'})
    project = Project()

    project.add_source(name, origin)

    self.assertEqual(origin, project.config.sources[name])
def test_can_do_transform_with_custom_model(self):
    # Source extractor: two items whose images are filled with 0s and 1s
    # respectively, each carrying a Label equal to its index.
    class TestExtractorSrc(Extractor):
        def __iter__(self):
            for i in range(2):
                yield DatasetItem(id=i, image=np.ones([2, 2, 3]) * i,
                    annotations=[Label(i)])

        def categories(self):
            label_cat = LabelCategories()
            label_cat.add('0')
            label_cat.add('1')
            return {AnnotationType.label: label_cat}

    # Model stub: predicts the label equal to the top-left pixel value.
    class TestLauncher(Launcher):
        def launch(self, inputs):
            for inp in inputs:
                yield [Label(inp[0, 0, 0])]

    # Destination extractor: reads back the per-item files written by
    # apply_model — one text file whose first line is the predicted label.
    class TestExtractorDst(Extractor):
        def __init__(self, url):
            super().__init__()
            self.items = [osp.join(url, p) for p in sorted(os.listdir(url))]

        def __iter__(self):
            for path in self.items:
                with open(path, 'r') as f:
                    index = osp.splitext(osp.basename(path))[0]
                    label = int(f.readline().strip())
                    yield DatasetItem(id=index, annotations=[Label(label)])

    model_name = 'model'
    launcher_name = 'custom_launcher'
    extractor_name = 'custom_extractor'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.env.extractors.register(extractor_name, TestExtractorSrc)
    project.add_model(model_name, {'launcher': launcher_name})
    project.add_source('source', {'format': extractor_name})

    with TestDir() as test_dir:
        project.make_dataset().apply_model(model=model_name,
            save_dir=test_dir)

        # Re-load the saved results with the destination extractor and
        # check the labels round-tripped (model echoes input intensity).
        result = Project.load(test_dir)
        result.env.extractors.register(extractor_name, TestExtractorDst)
        it = iter(result.make_dataset())
        item1 = next(it)
        item2 = next(it)

        self.assertEqual(0, item1.annotations[0].label)
        self.assertEqual(1, item2.annotations[0].label)
def __call__(self, path, **extra_params):
    """Create a project with one source per discovered subset file."""
    from datumaro.components.project import Project  # cyclic import

    found = self.find_subsets(path)
    if len(found) == 0:
        raise Exception("Failed to find dataset at '%s'" % path)

    project = Project()
    for desc in found:
        # Source name is the annotation file's base name without extension.
        name = osp.splitext(osp.basename(desc['url']))[0]
        project.add_source(name, desc)
    return project
def test_project_filter_can_be_applied(self):
    """extract() with an XPath filter must drop non-matching items."""
    class TestExtractor(Extractor):
        def __iter__(self):
            yield from (DatasetItem(id=i, subset='train')
                for i in range(10))

    project = Project()
    project.env.extractors.register('type', TestExtractor)
    project.add_source('source', {'format': 'type'})

    dataset = project.make_dataset().extract('/item[id < 5]')

    self.assertEqual(5, len(dataset))
def test_added_source_can_be_dumped(self):
    """A source must survive a project save/load round trip."""
    name = 'source'
    origin = Source({'url': 'path'})
    project = Project()
    project.add_source(name, origin)

    with TestDir() as test_dir:
        project.save(test_dir)

        reloaded = Project.load(test_dir)
        self.assertEqual(origin, reloaded.get_source(name))
def __call__(self, path):
    """Create a project pointing at a 'datumaro'-format dataset file."""
    from datumaro.components.project import Project  # cyclic import

    if not osp.exists(path):
        raise Exception("Failed to find 'datumaro' dataset at '%s'" % path)

    project = Project()
    name = osp.splitext(osp.basename(path))[0]
    project.add_source(name, {
        'url': path,
        'format': self.EXTRACTOR_NAME,
    })
    return project
def __call__(self, path, **extra_params):
    """Create a project with a single directory-based source."""
    from datumaro.components.project import Project  # cyclic import

    if not osp.isdir(path):
        raise Exception("Can't find a directory at '%s'" % path)

    project = Project()
    name = osp.basename(osp.normpath(path))
    # NOTE(review): the source url is the directory's base name, not the
    # full path (unlike the sibling importers) — presumably resolved
    # relative to the project directory; confirm against Project.make_dataset.
    project.add_source(name, {
        'url': name,
        'format': self.EXTRACTOR_NAME,
        'options': dict(extra_params),
    })
    return project
def __call__(self, path, **extra_params):
    """Create a project with one source per discovered dataset part."""
    from datumaro.components.project import Project  # cyclic import

    found = self.find_sources(osp.normpath(path))
    if len(found) == 0:
        raise Exception("Failed to find dataset at '%s'" % path)

    project = Project()
    for desc in found:
        # Per-source options take precedence over importer-wide parameters.
        merged = dict(extra_params)
        merged.update(desc.get('options', {}))
        desc['options'] = merged

        name = osp.splitext(osp.basename(desc['url']))[0]
        project.add_source(name, desc)
    return project
def check_data(json_path):
    """Load a COCO instances json with Datumaro and print basic stats."""
    # create Datumaro project with the json as its only source
    project = Project()
    project.add_source('src1', {
        'url': str(json_path),
        'format': 'coco_instances'
    })

    # create a dataset and report counts
    dataset = project.make_dataset()
    print(f'{json_path.stem}')
    print(f'num images: {num_img(dataset)}')
    print(f'num images with annotations: {num_img_with_annots(dataset)}')
    print(f'num annotations: {num_annots(dataset)}')
def __call__(self, path):
    """Create a project with one source per VOC task found under path."""
    from datumaro.components.project import Project  # cyclic import

    project = Project()
    for task, extractor_type, task_dir in self._TASKS:
        # A task is present only if its subsets directory exists.
        if not osp.isdir(osp.join(path, VocPath.SUBSETS_DIR, task_dir)):
            continue
        project.add_source(task.name, {
            'url': path,
            'format': extractor_type,
        })

    if len(project.config.sources) == 0:
        raise Exception("Failed to find 'voc' dataset at '%s'" % path)
    return project
def __call__(self, path, **extra_params):
    """Create a project with a '<task>-<subset>' source per subset file."""
    from datumaro.components.project import Project  # cyclic import

    found = self.find_subsets(path)
    if len(found) == 0:
        raise Exception("Failed to find 'voc' dataset at '%s'" % path)

    project = Project()
    for task, extractor_type, subset_path in found:
        subset = osp.splitext(osp.basename(subset_path))[0]
        project.add_source('%s-%s' % (task.name, subset), {
            'url': subset_path,
            'format': extractor_type,
            'options': dict(extra_params),
        })
    return project
def __call__(self, path, **extra_params):
    """Create a project with one source per found MOT annotation file.

    extra_params are forwarded to every source as its extractor options.
    """
    from datumaro.components.project import Project  # cyclic import
    project = Project()

    subsets = self.find_subsets(path)
    if len(subsets) == 0:
        raise Exception("Failed to find 'mot' dataset at '%s'" % path)

    for ann_file in subsets:
        log.info("Found a dataset at '%s'" % ann_file)

        source_name = osp.splitext(osp.basename(ann_file))[0]
        project.add_source(source_name, {
            'url': ann_file,
            'format': self._EXTRACTOR_NAME,
            # Copy so the sources do not share (and cannot mutate) one
            # dict — matches the other importers, which pass dict(extra_params).
            'options': dict(extra_params),
        })

    return project
def __call__(self, path, **extra_params): from datumaro.components.project import Project # cyclic import project = Project() subsets = self.find_subsets(path) if len(subsets) == 0: raise Exception("Failed to find 'coco' dataset at '%s'" % path) # TODO: should be removed when proper label merging is implemented conflicting_types = { CocoTask.instances, CocoTask.person_keypoints, CocoTask.labels } ann_types = set(t for s in subsets.values() for t in s) \ & conflicting_types if 1 <= len(ann_types): selected_ann_type = sorted(ann_types, key=lambda x: x.name)[0] if 1 < len(ann_types): log.warning("Not implemented: " "Found potentially conflicting source types with labels: %s. " "Only one type will be used: %s" \ % (", ".join(t.name for t in ann_types), selected_ann_type.name)) for ann_files in subsets.values(): for ann_type, ann_file in ann_files.items(): if ann_type in conflicting_types: if ann_type is not selected_ann_type: log.warning("Not implemented: " "conflicting source '%s' is skipped." % ann_file) continue log.info("Found a dataset at '%s'" % ann_file) source_name = osp.splitext(osp.basename(ann_file))[0] project.add_source( source_name, { 'url': ann_file, 'format': self._COCO_EXTRACTORS[ann_type], 'options': dict(extra_params), }) return project
def __call__(self, path, **extra_params):
    """Create a project with one source per found COCO annotation file.

    extra_params are forwarded to every source as its extractor options.
    """
    from datumaro.components.project import Project  # cyclic import
    project = Project()

    subsets = self.find_subsets(path)
    if len(subsets) == 0:
        raise Exception("Failed to find 'coco' dataset at '%s'" % path)

    for ann_files in subsets.values():
        for ann_type, ann_file in ann_files.items():
            source_name = osp.splitext(osp.basename(ann_file))[0]
            project.add_source(source_name, {
                'url': ann_file,
                'format': self._COCO_EXTRACTORS[ann_type],
                # Copy so the sources do not share (and cannot mutate) one
                # dict — matches the sibling importers' dict(extra_params).
                'options': dict(extra_params),
            })

    return project
def test_project_can_merge_item_annotations(self):
    """Annotations of same-id items from two sources are merged, with the
    duplicate annotation collapsed into one."""
    class TestExtractor(Extractor):
        def __init__(self, url, v=None):
            super().__init__()
            self.v = v

        def __iter__(self):
            if self.v == 1:
                yield DatasetItem(id=1, subset='train', annotations=[
                    LabelObject(2, id=3),
                    LabelObject(3, attributes={'x': 1}),
                ])
            else:
                yield DatasetItem(id=1, subset='train', annotations=[
                    LabelObject(3, attributes={'x': 1}),
                    LabelObject(4, id=4),
                ])

        def subsets(self):
            return ['train']

    project = Project()
    project.env.extractors.register('t1', lambda p: TestExtractor(p, v=1))
    project.env.extractors.register('t2', lambda p: TestExtractor(p, v=2))
    project.add_source('source1', {'format': 't1'})
    project.add_source('source2', {'format': 't2'})

    merged = project.make_dataset()

    self.assertEqual(1, len(merged))
    item = next(iter(merged))
    # Labels 2, 3, 4 — the shared LabelObject(3) appears only once.
    self.assertEqual(3, len(item.annotations))
def __call__(self, path, **extra_params):
    """Create a project with one source per found yolo config file."""
    from datumaro.components.project import Project  # cyclic import

    configs = self.find_configs(path)
    if len(configs) == 0:
        raise Exception("Failed to find 'yolo' dataset at '%s'" % path)

    project = Project()
    for config_path in configs:
        log.info("Found a dataset at '%s'" % config_path)

        # '<parent dir>_<file stem>' keeps names unique across directories.
        source_name = '%s_%s' % (
            osp.basename(osp.dirname(config_path)),
            osp.splitext(osp.basename(config_path))[0])
        project.add_source(source_name, {
            'url': config_path,
            'format': 'yolo',
            'options': dict(extra_params),
        })
    return project
def __call__(self, path):
    """Create a project with one source per VOC results task directory."""
    from datumaro.components.project import Project  # cyclic import

    project = Project()
    for task_name, extractor_type, task_dir in self._TASKS:
        full_dir = osp.join(path, task_dir)
        if not osp.isdir(full_dir):
            continue
        # The directory must contain an entry named after the task.
        if not find(os.listdir(full_dir), lambda x: x == task_name):
            continue
        project.add_source(task_name, {
            'url': full_dir,
            'format': extractor_type,
        })

    if len(project.config.sources) == 0:
        raise Exception("Failed to find 'voc_results' dataset at '%s'" % \
            path)
    return project
def __call__(self, path, **extra_params):
    """Create a project with one source per found LabelMe subset."""
    from datumaro.components.project import Project  # cyclic import

    found = self.find_subsets(path)
    if len(found) == 0:
        raise Exception("Failed to find 'label_me' dataset at '%s'" % path)

    project = Project()
    for subset_path, subset_name in found:
        options = {'subset_name': subset_name} if subset_name else {}
        # Importer-wide parameters take precedence over the subset name.
        options.update(extra_params)

        name = osp.splitext(osp.basename(subset_path))[0]
        project.add_source(name, {
            'url': subset_path,
            'format': self._EXTRACTOR_NAME,
            'options': options,
        })
    return project
def test_project_filter_can_be_applied(self):
    """set_filter() must limit the items produced by make_dataset()."""
    class TestExtractor(Extractor):
        def __init__(self, url, n=10):
            super().__init__(length=n)
            self.n = n

        def __iter__(self):
            yield from (DatasetItem(id=i, subset='train')
                for i in range(self.n))

        def subsets(self):
            return ['train']

    project = Project()
    project.env.extractors.register('type', TestExtractor)
    project.add_source('source', {'format': 'type'})
    project.set_filter('/item[id < 5]')

    dataset = project.make_dataset()

    self.assertEqual(5, len(dataset))
def __call__(self, path, **extra_params):
    """Create a project with one source per found CVAT subset file."""
    from datumaro.components.project import Project  # cyclic import

    found = self.find_subsets(path)
    if len(found) == 0:
        raise Exception("Failed to find 'cvat' dataset at '%s'" % path)

    project = Project()
    for subset_path in found:
        if not osp.isfile(subset_path):
            continue
        log.info("Found a dataset at '%s'" % subset_path)

        name = osp.splitext(osp.basename(subset_path))[0]
        project.add_source(name, {
            'url': subset_path,
            'format': self.EXTRACTOR_NAME,
            'options': dict(extra_params),
        })
    return project
ap = argparse.ArgumentParser()
group = ap.add_mutually_exclusive_group(required=True)
group.add_argument('--json_paths', nargs="+",
    help='json paths separated by whitespace')
group.add_argument('--annots_folder',
    help='path of annotation folder containing multiple jsons')
ap.add_argument('--output_json', help='path of output json', required=True)
args = ap.parse_args()

# Collect the json files, either listed explicitly or found in the
# annotations folder.
if args.json_paths:
    json_paths = list(args.json_paths)
else:
    # doesnt recursively search in subfolders
    json_paths = [p for p in Path(args.annots_folder).iterdir()
        if p.suffix == '.json']

# create Datumaro project and add each json as a coco_instances source
project = Project()
for i, json_path in enumerate(json_paths):
    new_json_path = check_json_path(json_path)
    project.add_source(f'src{i}',
        {'url': str(new_json_path), 'format': 'coco_instances'})

# create a dataset
dataset = project.make_dataset()

# print some stats
print(f'num images: {num_img(dataset)}')
print(f'num images with annotations: {num_img_with_annots(dataset)}')
print(f'num annotations: {num_annots(dataset)}')
from datumaro.components.project import Project
from datum_utils import num_img, num_img_with_annots, num_annots, export_json

ap = argparse.ArgumentParser()
ap.add_argument('--json_path', help='annotations json path', required=True)
args = ap.parse_args()

# WRITE YOUR SPLIT HERE
splits = {'train': ['set00', 'set01'], 'val': ['set02'], 'test': ['set03']}

# create Datumaro project with the coco json as its only source
project = Project()
project.add_source('src1', {'url': args.json_path, 'format': 'coco_instances'})
dataset = project.make_dataset()
print(f'total images: {num_img(dataset)}')

for split_name, split_list in splits.items():
    # DEFINE SPLIT FUNCTION HERE
    # Keep items whose id starts with one of this split's set prefixes.
    prefixes = tuple(split_list)
    dataset_split = dataset.select(lambda item: item.id.startswith(prefixes))

    # print some stats
    print(f'split: {split_name}')
    print(f'num images: {num_img(dataset_split)}')
    print(f'num images with annotations: {num_img_with_annots(dataset_split)}')
    print(f'num annotations: {num_annots(dataset_split)}')
def test_can_do_transform_with_custom_model(self):
    # Source: n items in 'train'; the image payload is just the index i,
    # each labeled i.
    class TestExtractorSrc(Extractor):
        def __init__(self, url, n=2):
            super().__init__(length=n)
            self.n = n

        def __iter__(self):
            for i in range(self.n):
                yield DatasetItem(id=i, subset='train', image=i,
                    annotations=[LabelObject(i)])

        def subsets(self):
            return ['train']

    # Model stub: predicts the label equal to the input value.
    class TestLauncher(Launcher):
        def __init__(self, **kwargs):
            pass

        def launch(self, inputs):
            for inp in inputs:
                yield [LabelObject(inp)]

    # Writes one '<id>.txt' per item: first line subset, second line label.
    class TestConverter(Converter):
        def __call__(self, extractor, save_dir):
            for item in extractor:
                with open(osp.join(save_dir, '%s.txt' % item.id), 'w+') as f:
                    f.write(str(item.subset) + '\n')
                    f.write(str(item.annotations[0].label) + '\n')

    # Reads those files back into DatasetItems.
    class TestExtractorDst(Extractor):
        def __init__(self, url):
            super().__init__()
            self.items = [osp.join(url, p) for p in sorted(os.listdir(url))]

        def __iter__(self):
            for path in self.items:
                with open(path, 'r') as f:
                    index = osp.splitext(osp.basename(path))[0]
                    subset = f.readline()[:-1]
                    label = int(f.readline()[:-1])
                    assert (subset == 'train')
                    yield DatasetItem(id=index, subset=subset,
                        annotations=[LabelObject(label)])

        def __len__(self):
            return len(self.items)

        def subsets(self):
            return ['train']

    model_name = 'model'
    launcher_name = 'custom_launcher'
    extractor_name = 'custom_extractor'

    project = Project()
    project.env.launchers.register(launcher_name, TestLauncher)
    project.env.extractors.register(extractor_name, TestExtractorSrc)
    project.env.converters.register(extractor_name, TestConverter)
    project.add_model(model_name, {'launcher': launcher_name})
    project.add_source('source', {'format': extractor_name})

    with TestDir() as test_dir:
        project.make_dataset().transform(model_name, test_dir.path)

        # Reload through the destination extractor and check the labels
        # survived the model + converter round trip.
        result = Project.load(test_dir.path)
        result.env.extractors.register(extractor_name, TestExtractorDst)
        it = iter(result.make_dataset())
        item1 = next(it)
        item2 = next(it)

        self.assertEqual(0, item1.annotations[0].label)
        self.assertEqual(1, item2.annotations[0].label)