Example 1
    def test_cant_merge_different_categories(self):
        class TestExtractor1(Extractor):
            def __iter__(self):
                return iter([])

            def categories(self):
                return {
                    AnnotationType.label:
                    LabelCategories.from_iterable(['a', 'b'])
                }

        class TestExtractor2(Extractor):
            def __iter__(self):
                return iter([])

            def categories(self):
                return {
                    AnnotationType.label:
                    LabelCategories.from_iterable(['b', 'a'])
                }

        e_name1 = 'e1'
        e_name2 = 'e2'

        project = Project()
        project.env.extractors.register(e_name1, TestExtractor1)
        project.env.extractors.register(e_name2, TestExtractor2)
        project.add_source('source1', {'format': e_name1})
        project.add_source('source2', {'format': e_name2})

        with self.assertRaisesRegex(Exception, "different categories"):
            project.make_dataset()
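
These snippets are drawn from tests and scripts written against pre-1.0 Datumaro, with their imports omitted. A plausible import block covering most of them, assuming the module layout of those early releases (paths moved around between versions, and the oldest snippets use LabelObject where later ones use Label, so verify against the installed version):

import os
import os.path as osp

import numpy as np

# Module paths as in early Datumaro releases (an assumption; later
# releases moved annotation types to datumaro.components.annotation).
from datumaro.components.project import Project, Environment
from datumaro.components.extractor import (Extractor, DatasetItem,
    Label, AnnotationType, LabelCategories)
from datumaro.components.launcher import Launcher
from datumaro.components.converter import Converter
from datumaro.components.operations import IntersectMerge
from datumaro.util.test_utils import TestDir, compare_datasets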
Example 2
    def test_can_do_transform_with_custom_model(self):
        class TestExtractorSrc(Extractor):
            def __iter__(self):
                for i in range(2):
                    yield DatasetItem(id=i,
                                      image=np.ones([2, 2, 3]) * i,
                                      annotations=[Label(i)])

        class TestLauncher(Launcher):
            def launch(self, inputs):
                for inp in inputs:
                    yield [Label(inp[0, 0, 0])]

        class TestConverter(Converter):
            def __call__(self, extractor, save_dir):
                for item in extractor:
                    with open(osp.join(save_dir, '%s.txt' % item.id),
                              'w') as f:
                        f.write(str(item.annotations[0].label) + '\n')

        class TestExtractorDst(Extractor):
            def __init__(self, url):
                super().__init__()
                self.items = [
                    osp.join(url, p) for p in sorted(os.listdir(url))
                ]

            def __iter__(self):
                for path in self.items:
                    with open(path, 'r') as f:
                        index = osp.splitext(osp.basename(path))[0]
                        label = int(f.readline().strip())
                        yield DatasetItem(id=index, annotations=[Label(label)])

        model_name = 'model'
        launcher_name = 'custom_launcher'
        extractor_name = 'custom_extractor'

        project = Project()
        project.env.launchers.register(launcher_name, TestLauncher)
        project.env.extractors.register(extractor_name, TestExtractorSrc)
        project.env.converters.register(extractor_name, TestConverter)
        project.add_model(model_name, {'launcher': launcher_name})
        project.add_source('source', {'format': extractor_name})

        with TestDir() as test_dir:
            project.make_dataset().apply_model(model=model_name,
                                               save_dir=test_dir)

            result = Project.load(test_dir)
            result.env.extractors.register(extractor_name, TestExtractorDst)
            it = iter(result.make_dataset())
            item1 = next(it)
            item2 = next(it)
            self.assertEqual(0, item1.annotations[0].label)
            self.assertEqual(1, item2.annotations[0].label)
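
The round trip in this test: TestExtractorSrc yields items whose pixel values encode the label, TestLauncher reads that label back out of each input image, TestConverter serializes the inferred annotations to one text file per item, and TestExtractorDst, registered on the reloaded project, parses those files back into DatasetItems so the assertions can verify the labels survived the trip.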
Example 3
    def test_custom_extractor_can_be_created(self):
        class CustomExtractor(Extractor):
            def __init__(self, url):
                super().__init__()

            def __iter__(self):
                return iter([
                    DatasetItem(id=0, subset='train'),
                    DatasetItem(id=1, subset='train'),
                    DatasetItem(id=2, subset='train'),
                    DatasetItem(id=3, subset='test'),
                ])

            def subsets(self):
                return ['train', 'test']

        extractor_name = 'ext1'
        project = Project()
        project.env.extractors.register(extractor_name, CustomExtractor)
        project.add_source('src1', {
            'url': 'path',
            'format': extractor_name,
        })
        project.set_subsets(['train'])

        dataset = project.make_dataset()

        self.assertEqual(3, len(dataset))
Example 4
    def test_project_can_merge_item_annotations(self):
        class TestExtractor1(Extractor):
            def __iter__(self):
                yield DatasetItem(id=1,
                                  subset='train',
                                  annotations=[
                                      Label(2, id=3),
                                      Label(3, attributes={'x': 1}),
                                  ])

        class TestExtractor2(Extractor):
            def __iter__(self):
                yield DatasetItem(id=1,
                                  subset='train',
                                  annotations=[
                                      Label(3, attributes={'x': 1}),
                                      Label(4, id=4),
                                  ])

        project = Project()
        project.env.extractors.register('t1', TestExtractor1)
        project.env.extractors.register('t2', TestExtractor2)
        project.add_source('source1', {'format': 't1'})
        project.add_source('source2', {'format': 't2'})

        merged = project.make_dataset()

        self.assertEqual(1, len(merged))

        item = next(iter(merged))
        self.assertEqual(3, len(item.annotations))
Example 5
    def test_project_compound_child_can_be_modified_recursively(self):
        with TestDir() as test_dir:
            child1 = Project({
                'project_dir': osp.join(test_dir, 'child1'),
            })
            child1.save()

            child2 = Project({
                'project_dir': osp.join(test_dir, 'child2'),
            })
            child2.save()

            parent = Project()
            parent.add_source('child1', {'url': child1.config.project_dir})
            parent.add_source('child2', {'url': child2.config.project_dir})
            dataset = parent.make_dataset()

            item1 = DatasetItem(id='ch1', path=['child1'])
            item2 = DatasetItem(id='ch2', path=['child2'])
            dataset.put(item1)
            dataset.put(item2)

            self.assertEqual(2, len(dataset))
            self.assertEqual(1, len(dataset.sources['child1']))
            self.assertEqual(1, len(dataset.sources['child2']))
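
Note how put() routes each new item to the child source named in its path attribute: the parent dataset grows to two items, and each child source picks up exactly one of them.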
Example 6
    def test_source_datasets_can_be_merged(self):
        class TestExtractor(Extractor):
            def __init__(self, url, n=0, s=0):
                super().__init__(length=n)
                self.n = n
                self.s = s

            def __iter__(self):
                for i in range(self.n):
                    yield DatasetItem(id=self.s + i, subset='train')

        e_name1 = 'e1'
        e_name2 = 'e2'
        n1 = 2
        n2 = 4

        project = Project()
        project.env.extractors.register(e_name1,
                                        lambda p: TestExtractor(p, n=n1))
        project.env.extractors.register(e_name2,
                                        lambda p: TestExtractor(p, n=n2, s=n1))
        project.add_source('source1', {'format': e_name1})
        project.add_source('source2', {'format': e_name2})

        dataset = project.make_dataset()

        self.assertEqual(n1 + n2, len(dataset))
Example 7
    def test_custom_extractor_can_be_created(self):
        class CustomExtractor(Extractor):
            def __iter__(self):
                return iter([
                    DatasetItem(id=0, subset='train'),
                    DatasetItem(id=1, subset='train'),
                    DatasetItem(id=2, subset='train'),
                    DatasetItem(id=3, subset='test'),
                    DatasetItem(id=4, subset='test'),
                    DatasetItem(id=1),
                    DatasetItem(id=2),
                    DatasetItem(id=3),
                ])

        extractor_name = 'ext1'
        project = Project()
        project.env.extractors.register(extractor_name, CustomExtractor)
        project.add_source('src1', {
            'url': 'path',
            'format': extractor_name,
        })

        dataset = project.make_dataset()

        compare_datasets(self, CustomExtractor(), dataset)
Example 8
    def test_project_own_dataset_can_be_modified(self):
        project = Project()
        dataset = project.make_dataset()

        item = DatasetItem(id=1)
        dataset.put(item)

        self.assertEqual(item, next(iter(dataset)))
Example 9
    def test_can_have_project_source(self):
        with TestDir() as test_dir:
            Project.generate(test_dir)

            project2 = Project()
            project2.add_source('project1', {
                'url': test_dir,
            })
            dataset = project2.make_dataset()

            self.assertTrue('project1' in dataset.sources)
Example 10
    def test_can_save_and_load_own_dataset(self):
        with TestDir() as test_dir:
            src_project = Project()
            src_dataset = src_project.make_dataset()
            item = DatasetItem(id=1)
            src_dataset.put(item)
            src_dataset.save(test_dir)

            loaded_project = Project.load(test_dir)
            loaded_dataset = loaded_project.make_dataset()

            self.assertEqual(list(src_dataset), list(loaded_dataset))
Example 11
    def test_project_filter_can_be_applied(self):
        class TestExtractor(Extractor):
            def __iter__(self):
                for i in range(10):
                    yield DatasetItem(id=i, subset='train')

        e_type = 'type'
        project = Project()
        project.env.extractors.register(e_type, TestExtractor)
        project.add_source('source', {'format': e_type})

        dataset = project.make_dataset().extract('/item[id < 5]')

        self.assertEqual(5, len(dataset))
Example 12
def check_data(json_path):
    # create Datumaro project
    project = Project()

    # add source
    project.add_source('src1', {
        'url': str(json_path),
        'format': 'coco_instances'
    })

    # create a dataset
    dataset = project.make_dataset()
    print(f'{json_path.stem}')

    print(f'num images: {num_img(dataset)}')
    print(f'num images with annotations: {num_img_with_annots(dataset)}')
    print(f'num annotations: {num_annots(dataset)}')
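
The counting helpers are not shown in this snippet. A minimal sketch of what they plausibly compute, assuming they only inspect the dataset items (the real implementations may differ):

def num_img(dataset):
    # Each dataset item corresponds to one image.
    return len(dataset)

def num_img_with_annots(dataset):
    # Items carrying at least one annotation.
    return sum(1 for item in dataset if len(item.annotations) > 0)

def num_annots(dataset):
    # Total annotation count across all items.
    return sum(len(item.annotations) for item in dataset)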
Example 13
    def test_project_can_merge_item_annotations(self):
        class TestExtractor(Extractor):
            def __init__(self, url, v=None):
                super().__init__()
                self.v = v

            def __iter__(self):
                v1_item = DatasetItem(id=1,
                                      subset='train',
                                      annotations=[
                                          LabelObject(2, id=3),
                                          LabelObject(3, attributes={'x': 1}),
                                      ])

                v2_item = DatasetItem(id=1,
                                      subset='train',
                                      annotations=[
                                          LabelObject(3, attributes={'x': 1}),
                                          LabelObject(4, id=4),
                                      ])

                if self.v == 1:
                    yield v1_item
                else:
                    yield v2_item

            def subsets(self):
                return ['train']

        project = Project()
        project.env.extractors.register('t1', lambda p: TestExtractor(p, v=1))
        project.env.extractors.register('t2', lambda p: TestExtractor(p, v=2))
        project.add_source('source1', {'format': 't1'})
        project.add_source('source2', {'format': 't2'})

        merged = project.make_dataset()

        self.assertEqual(1, len(merged))

        item = next(iter(merged))
        self.assertEqual(3, len(item.annotations))
Example 14
    def test_project_filter_can_be_applied(self):
        class TestExtractor(Extractor):
            def __init__(self, url, n=10):
                super().__init__(length=n)
                self.n = n

            def __iter__(self):
                for i in range(self.n):
                    yield DatasetItem(id=i, subset='train')

            def subsets(self):
                return ['train']

        e_type = 'type'
        project = Project()
        project.env.extractors.register(e_type, TestExtractor)
        project.add_source('source', {'format': e_type})
        project.set_filter('/item[id < 5]')

        dataset = project.make_dataset()

        self.assertEqual(5, len(dataset))
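
Compare this with Example 11: there the XPath-like filter is applied once, via extract(), to an already built dataset, whereas set_filter() records the expression in the project configuration, so it is re-applied every time make_dataset() is called.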
Example 15
def merge(cleaned_datasets, output, save_images=False):
    """datum merge -o {output} {project_dirs}"""

    print(f"Merging datasets to {output}/")
    projects = [Project.load(p) for p in cleaned_datasets]
    datasets = [p.make_dataset() for p in projects]

    merged_project_dir = Path(output)

    # perform the merge
    merge_config = IntersectMerge.Conf(
        pairwise_dist=0.25,
        groups=[],
        output_conf_thresh=0.0,
        quorum=0,
    )
    merged_dataset = IntersectMerge(conf=merge_config)(datasets)

    merged_project = Project()
    output_dataset = merged_project.make_dataset()
    output_dataset.define_categories(merged_dataset.categories())
    merged_dataset = output_dataset.update(merged_dataset)
    merged_dataset.save(save_dir=merged_project_dir, save_images=save_images)
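
A hypothetical invocation, assuming each entry in cleaned_datasets is a Datumaro project directory (the paths below are illustrative):

merge(['data/clean/ds_a', 'data/clean/ds_b'], output='data/merged',
      save_images=True)

The Conf values here are deliberately permissive: quorum=0 and output_conf_thresh=0.0 keep every merged annotation, while pairwise_dist=0.25 is the matching threshold that Example 16 wires to the CLI's --iou-thresh flag.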
Example 16
def merge_command(args):
    source_projects = [load_project(p) for p in args.project]

    dst_dir = args.dst_dir
    if dst_dir:
        if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException("Directory '%s' already exists "
                               "(pass --overwrite to overwrite)" % dst_dir)
    else:
        dst_dir = generate_next_file_name('merged')

    source_datasets = []
    for p in source_projects:
        log.debug("Loading project '%s' dataset", p.config.project_name)
        source_datasets.append(p.make_dataset())

    merger = IntersectMerge(
        conf=IntersectMerge.Conf(pairwise_dist=args.iou_thresh,
                                 groups=args.groups,
                                 output_conf_thresh=args.output_conf_thresh,
                                 quorum=args.quorum))
    merged_dataset = merger(source_datasets)

    merged_project = Project()
    output_dataset = merged_project.make_dataset()
    output_dataset.define_categories(merged_dataset.categories())
    merged_dataset = output_dataset.update(merged_dataset)
    merged_dataset.save(save_dir=dst_dir)

    report_path = osp.join(dst_dir, 'merge_report.json')
    save_merge_report(merger, report_path)

    dst_dir = osp.abspath(dst_dir)
    log.info("Merge results have been saved to '%s'" % dst_dir)
    log.info("Report has been saved to '%s'" % report_path)

    return 0
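
This is essentially the implementation behind the datum merge CLI command referenced in Example 15: each source project is loaded and flattened into a dataset, IntersectMerge votes the sources into a single dataset, and the result is saved alongside a merge_report.json summarizing the merge.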
Example 17
    def test_can_do_transform_with_custom_model(self):
        class TestExtractorSrc(Extractor):
            def __init__(self, url, n=2):
                super().__init__(length=n)
                self.n = n

            def __iter__(self):
                for i in range(self.n):
                    yield DatasetItem(id=i,
                                      subset='train',
                                      image=i,
                                      annotations=[LabelObject(i)])

            def subsets(self):
                return ['train']

        class TestLauncher(Launcher):
            def __init__(self, **kwargs):
                pass

            def launch(self, inputs):
                for inp in inputs:
                    yield [LabelObject(inp)]

        class TestConverter(Converter):
            def __call__(self, extractor, save_dir):
                for item in extractor:
                    with open(osp.join(save_dir, '%s.txt' % item.id),
                              'w+') as f:
                        f.write(str(item.subset) + '\n')
                        f.write(str(item.annotations[0].label) + '\n')

        class TestExtractorDst(Extractor):
            def __init__(self, url):
                super().__init__()
                self.items = [
                    osp.join(url, p) for p in sorted(os.listdir(url))
                ]

            def __iter__(self):
                for path in self.items:
                    with open(path, 'r') as f:
                        index = osp.splitext(osp.basename(path))[0]
                        subset = f.readline()[:-1]
                        label = int(f.readline()[:-1])
                        assert (subset == 'train')
                        yield DatasetItem(id=index,
                                          subset=subset,
                                          annotations=[LabelObject(label)])

            def __len__(self):
                return len(self.items)

            def subsets(self):
                return ['train']

        model_name = 'model'
        launcher_name = 'custom_launcher'
        extractor_name = 'custom_extractor'

        project = Project()
        project.env.launchers.register(launcher_name, TestLauncher)
        project.env.extractors.register(extractor_name, TestExtractorSrc)
        project.env.converters.register(extractor_name, TestConverter)
        project.add_model(model_name, {'launcher': launcher_name})
        project.add_source('source', {'format': extractor_name})

        with TestDir() as test_dir:
            project.make_dataset().transform(model_name, test_dir.path)

            result = Project.load(test_dir.path)
            result.env.extractors.register(extractor_name, TestExtractorDst)
            it = iter(result.make_dataset())
            item1 = next(it)
            item2 = next(it)
            self.assertEqual(0, item1.annotations[0].label)
            self.assertEqual(1, item2.annotations[0].label)
Example 18
# Argument parsing; the parser setup is reconstructed, since this
# fragment begins at the argument definitions.
import argparse
from pathlib import Path

ap = argparse.ArgumentParser()
group = ap.add_mutually_exclusive_group(required=True)
group.add_argument('--json_paths', nargs='+',
                   help='JSON paths separated by whitespace')
group.add_argument('--annots_folder',
                   help='path of an annotation folder containing multiple JSON files')
ap.add_argument('--output_json', help='path of the output JSON', required=True)
args = ap.parse_args()

# create Datumaro project
project = Project()

# add sources
if args.json_paths:
    for i, json_path in enumerate(args.json_paths):
        new_json_path = check_json_path(json_path)
        project.add_source(f'src{i}', {'url': str(new_json_path), 'format': 'coco_instances'})
elif args.annots_folder:
    # does not search subfolders recursively
    for i, json_path in enumerate(Path(args.annots_folder).iterdir()):
        if json_path.suffix == '.json':
            new_json_path = check_json_path(json_path)
            project.add_source(f'src{i}', {'url': str(new_json_path), 'format': 'coco_instances'})

# create a dataset
dataset = project.make_dataset()

# print some stats
print(f'num images: {num_img(dataset)}')
print(f'num images with annotations: {num_img_with_annots(dataset)}')
print(f'num annotations: {num_annots(dataset)}')

# export the resulting json in COCO format
export_json(dataset, args.output_json)
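
check_json_path and export_json are defined elsewhere in this script. A plausible sketch of the former, assuming it only validates the path (the real helper may do more, e.g. fixing image paths referenced inside the file):

from pathlib import Path

def check_json_path(json_path):
    # Ensure the path points to an existing .json file.
    json_path = Path(json_path)
    if json_path.suffix != '.json':
        raise ValueError(f'not a JSON file: {json_path}')
    if not json_path.is_file():
        raise FileNotFoundError(str(json_path))
    return json_path

export_json is presumably the inverse of the sources added above, writing the merged dataset back out through the coco_instances converter.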
Example 19
    def mergeDataset(self, import_args: Arg, filter_arg: Arg):
        config = setConfig(import_args['format'])
        source_datasets = dict([(path, Environment().make_importer(
            import_args['format'])(str(path)).make_dataset())
                                for path in self.datasetPathList])
        itemIdsAndPath = reduce(lambda x, y: x + y,
                                [[(item.id, path) for item in dataset]
                                 for path, dataset in source_datasets.items()])
        # for itemId, path in itemIdsAndPath:
        for path, dataset in source_datasets.items():
            itemIdsInPath = set(
                [itemId for itemId, _path in itemIdsAndPath if _path == path])
            itemIdsOutPath = set(
                [itemId for itemId, _path in itemIdsAndPath if _path != path])
            if itemIdsInPath & itemIdsOutPath:
                for subsetName, subset in dataset.subsets().items():
                    imgDir: Path = path / config.getImgDir(subsetName)
                    _subset = deepcopy(subset.items)
                    for item in _subset.values():
                        imgFile = Path(item.image.path)
                        relPath = imgFile.relative_to(imgDir)
                        newPath = imgDir / path.name / relPath
                        oldItemId = item.id
                        newItemId = item.id = str(path.name / relPath.parent /
                                                  relPath.stem).replace(
                                                      '\\', '/')
                        item.image._path = str(newPath)
                        del subset.items[oldItemId]
                        subset.items[newItemId] = item
                        newPath.parent.mkdir(parents=True, exist_ok=True)

                        if item.image.has_data:
                            move(str(imgFile),
                                 str(imgDir / path.name / relPath))

        mergePath = (self.projectsPath / self.mergeFolderName)
        if mergePath.is_dir():
            rmtree(mergePath, onerror=remove_readonly)
        mergePath.mkdir(exist_ok=True, parents=True)
        dst_dir = str(mergePath)

        merger = IntersectMerge(conf=IntersectMerge.Conf())
        merged_dataset = merger(list(source_datasets.values()))

        merged_project = Project()
        output_dataset = merged_project.make_dataset()
        output_dataset.define_categories(merged_dataset.categories())
        merged_dataset = output_dataset.update(merged_dataset)
        if filter_arg['no_anno_filter'].lower() == 'y':
            filtered_dataset = Project().make_dataset()
            filtered_dataset.define_categories(merged_dataset.categories())
            merged_dataset = filtered_dataset.update(
                merged_dataset.select(lambda item: len(item.annotations) != 0))
        annoId = 1
        imageIdName = config.imageIdName
        for idx, item in tqdm(enumerate(merged_dataset), desc='datasets'):
            if imageIdName is not None:
                item.attributes[imageIdName] = idx + 1
            for anno in item.annotations:
                anno.id = annoId
                annoId += 1
        merged_dataset.save(save_dir=dst_dir, save_images=True)

        # for subsetName, subset in tqdm(merged_dataset.subsets().items(), desc='datasets'):
        #     for idx, itemId in tqdm(enumerate(itemIds), desc='items'):
        #         if imageIdName is not None:
        #             merged_dataset.get(itemId,subset=subsetName).attributes[imageIdName] = idx+1
        #         for anno in merged_dataset.get(itemId, subset=subsetName).annotations:
        #             anno.id = annoId
        #             annoId += 1
        #     merged_dataset.save(save_dir=dst_dir, save_images=True)
        return self