Example #1
0
def process_cli(mets_url, **kwargs):
    """
    Run the requested OCR-D processor executables on a METS file.

    ``kwargs['ocrd_tool']`` is iterated as paths of JSON files whose
    ``tools`` entries each provide a ``binary`` name; ``kwargs['steps']``
    lists the executables to run, in order.  Every step has to be one of
    the collected binaries, otherwise an ``Exception`` is raised before
    any step is executed.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets_url)

    # Collect the executable names declared by all given tool descriptions.
    registered = []
    for tool_json_path in kwargs['ocrd_tool']:
        with codecs.open(tool_json_path, encoding='utf-8') as handle:
            description = json.loads(handle.read())
            registered.extend(tool['binary'] for tool in description['tools'])

    # Check the complete list of steps first so nothing runs on a bad request.
    for step in kwargs['steps']:
        if step not in registered:
            raise Exception("Tool not registered: '%s'" % step)

    for step in kwargs['steps']:
        run_cli(step, mets_url, resolver, workspace)

    workspace.reload_mets()

    print(workspace)
Example #2
0
def kant_ocrdzip(ocrd_identifier):
    """
    Bag the ``kant_aufklaerung_1784`` asset workspace and yield the bag path.

    The bag is written to a timestamp-named ``.ocrd.zip`` file in the system
    temporary directory.  The file is removed again when the generator is
    resumed, closed, or has an exception thrown into it.

    :param ocrd_identifier: identifier passed on to ``WorkspaceBagger.bag``
    """
    resolver = Resolver()
    bagger = WorkspaceBagger(resolver, strict=True)
    # Millisecond timestamp in the file name (presumably to avoid collisions
    # between runs).
    dest = join(gettempdir(),
                'olahd-test-bag-%d.ocrd.zip' % int(round((time() * 1000))))
    ws = resolver.workspace_from_url(
        assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
    bagger.bag(ws, ocrd_identifier, dest=dest)
    try:
        yield dest
    finally:
        # Clean up even when the consumer raises or abandons the generator.
        unlink(dest)
Example #3
0
 def test_binarize_lines(self):
     """Run ``KrakenBinarize`` with line-level operation and save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     binarizer = KrakenBinarize(
         workspace,
         input_file_grp="OCR-D-GT-PAGE",
         output_file_grp="OCR-D-IMG-BIN-KRAKEN",
         parameter={'level-of-operation': 'line'},
     )
     binarizer.process()
     workspace.save_mets()
Example #4
0
 def runTest(self):
     """Run region, line and word segmentation in sequence, then save the METS."""
     resolver = Resolver()
     workspace = resolver.workspace_from_url(METS_HEROLD_SMALL,
                                             dst_dir=WORKSPACE_DIR)
     # (processor class, input file group, output file group), in run order;
     # each step reads the file group written by the previous one.
     segmentation_steps = (
         (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
         (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
         (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
     )
     for processor_cls, source_grp, target_grp in segmentation_steps:
         processor_cls(workspace,
                       input_file_grp=source_grp,
                       output_file_grp=target_grp).process()
     workspace.save_mets()
Example #5
0
class TestXsdValidator(TestCase):
    """Tests for ``XsdValidator`` / ``XsdMetsValidator`` on METS documents."""

    def setUp(self):
        """Open the SBB sample workspace used by the validation tests."""
        self.resolver = Resolver()
        sample_mets = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.ws = self.resolver.workspace_from_url(sample_mets)

    def test_constructor(self):
        """An unknown schema is rejected, the METS schema URL is accepted."""
        with self.assertRaisesRegex(Exception, 'schema not bundled'):
            XsdValidator('foo')
        XsdValidator(XSD_METS_URL)

    def test_mets_empty(self):
        """The empty METS template yields exactly two known schema errors."""
        expected_errors = (
            "Line 3: Element '{http://www.loc.gov/METS/}metsHdr', attribute 'CREATEDATE': '{{ NOW }}' is not a valid value of the atomic type 'xs:dateTime'.",
            "Line 18: Element '{http://www.loc.gov/METS/}fileSec': Missing child element(s). Expected is ( {http://www.loc.gov/METS/}fileGrp ).",
        )
        with TemporaryDirectory() as tempdir:
            mets_path = Path(tempdir, 'mets.xml')
            mets_path.write_bytes(METS_XML_EMPTY)
            report = XsdMetsValidator.validate(mets_path)
            self.assertEqual(len(report.errors), 2)
            for position, message in enumerate(expected_errors):
                self.assertEqual(report.errors[position], message)
            self.assertFalse(report.is_valid)

    def test_validate_simple_protected_str(self):
        """``_validate`` accepts the serialized METS of the sample workspace."""
        report = XsdValidator(XSD_METS_URL)._validate(self.ws.mets.to_xml())
        self.assertTrue(report.is_valid)

    def test_validate_simple_protected_doc(self):
        """``_validate`` accepts the parsed METS tree of the sample workspace."""
        report = XsdValidator(XSD_METS_URL)._validate(self.ws.mets._tree)
        self.assertTrue(report.is_valid)

    def test_validate_simple_static_doc(self):
        """``XsdValidator.validate`` accepts the parsed METS tree as well."""
        mets_tree = self.ws.mets._tree
        report = XsdValidator.validate(XSD_METS_URL, mets_tree)
        self.assertTrue(report.is_valid)
Example #6
0
class TestCli(TestCase):
    def setUp(self):
        """Create the resolver and CLI runner shared by the tests of this class."""
        super().setUp()
        disableLogging()
        # unittest: no limit on the length of diffs shown on assertion failure
        self.maxDiff = None
        self.resolver = Resolver()
        # NOTE(review): mix_stderr=False presumably keeps stderr separate from
        # stdout in the runner's results - confirm against the click version used
        self.runner = CliRunner(mix_stderr=False)

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing

        The file is first added through the Python API and then, in a
        second workspace, through the command line; the CLI call has to
        exit with code 0.
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # Reference run through the API
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp,
                            ID=ID,
                            content=content,
                            pageId=page_id,
                            mimetype=mimetype,
                            local_filename=local_filename)
            ws_api.save_mets()

        # Same operation through the CLI
        with TemporaryDirectory() as tempdir:
            self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            # Invoke the CLI only after the file has been closed, so that the
            # content is flushed to disk before the command reads the file.
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype,
                content_file
            ])
            # TODO comparing the METS written by API and CLI is too complex
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        """``remove --keep-file`` succeeds and leaves the added file on disk."""
        file_id = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as handle:
                handle.write('x')

            init_result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(init_result.exit_code, 0)

            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', file_id,
                '--mimetype', mimetype,
                content_file,
            ]
            add_result = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(add_result.exit_code, 0)

            remove_args = ['-d', tempdir, 'remove', '--keep-file', file_id]
            remove_result = self.runner.invoke(workspace_cli, remove_args)
            self.assertEqual(remove_result.exit_code, 0)

            # The file itself must survive the removal from the workspace
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        """``remove --force`` succeeds and deletes the added file from disk."""
        file_id = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as handle:
                handle.write('x')

            init_result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(init_result.exit_code, 0)

            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', file_id,
                '--mimetype', mimetype,
                content_file,
            ]
            add_result = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(add_result.exit_code, 0)

            remove_args = ['-d', tempdir, 'remove', '--force', file_id]
            remove_result = self.runner.invoke(workspace_cli, remove_args)
            self.assertEqual(remove_result.exit_code, 0)

            # The file itself must be gone after the forced removal
            self.assertFalse(exists(content_file))

    def test_add_url(self):
        """Adding an http URL stores exactly that URL for the file in the METS."""
        file_id = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        remote_url = 'http://remote/file.tif'
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', file_id,
                '--mimetype', mimetype,
                remote_url,
            ]
            result = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(result.exit_code, 0)
            # Pick up the changes the CLI wrote to the METS on disk
            ws.reload_mets()
            first_file = ws.mets.find_all_files()[0]
            self.assertEqual(first_file.url, remote_url)

    def test_add_nonexisting_checked(self):
        """``add -C`` on a missing local file exits with 1 and reports the file."""
        file_id = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            add_args = [
                '-d', tempdir, 'add', '-C',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', file_id,
                '--mimetype', mimetype,
                'does-not-exist.xml',
            ]
            exit_code, _, err = self.invoke_cli(workspace_cli, add_args)
            self.assertEqual(exit_code, 1)
            expected_message = \
                "File 'does-not-exist.xml' does not exist, halt execution!"
            self.assertIn(expected_message, err)

    def test_add_519(self):
        """
        https://github.com/OCR-D/core/issues/519

        A file from outside the workspace that is added with ``-C`` is
        expected under ``MAX/`` in the workspace with identical content.
        """
        with TemporaryDirectory() as tempdir:
            wsdir = Path(tempdir, "workspace")
            wsdir.mkdir()
            srcdir = Path(tempdir, "source")
            srcdir.mkdir()
            srcfile = srcdir / "srcfile.jpg"
            srcfile_content = 'foo'
            srcfile.write_text(srcfile_content)
            with pushd_popd(str(wsdir)):
                self.invoke_cli(workspace_cli, ['init'])
                add_args = [
                    'add', '-m', 'image/jpg', '-G', 'MAX', '-i',
                    'IMG_MAX_1818975', '-C',
                    str(srcfile),
                ]
                exit_code, _, _ = self.invoke_cli(workspace_cli, add_args)
                self.assertEqual(exit_code, 0)
                added_file = wsdir / 'MAX' / 'srcfile.jpg'
                self.assertTrue(added_file.exists())
                self.assertEqual(added_file.read_text(), srcfile_content)

    def test_add_existing_checked(self):
        """``add -C`` on an existing file stores its workspace-relative URL."""
        file_id = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'test.tif')
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            with open(content_file, 'w') as handle:
                handle.write('x')
            add_args = [
                '-d', tempdir, 'add', '-C',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', file_id,
                '--mimetype', mimetype,
                content_file,
            ]
            result = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(result.exit_code, 0)
            # Pick up the changes the CLI wrote to the METS on disk
            ws.reload_mets()
            first_file = ws.mets.find_all_files()[0]
            self.assertEqual(first_file.url, 'test.tif')

    def test_find_all_files(self):
        """``find -G OCR-D-IMG-BIN -k fileGrp`` prints the group once per file."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                find_args = ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp']
                result = self.runner.invoke(workspace_cli, find_args)
                expected_output = 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n'
                self.assertEqual(result.output, expected_output)
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        """``prune-files`` reduces the sample METS from 35 to 7 files."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            mets_path = join(tempdir, 'ws', 'mets.xml')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)

            before = self.resolver.workspace_from_url(mets_path)
            self.assertEqual(len(before.mets.find_all_files()), 35)

            result = self.runner.invoke(workspace_cli,
                                        ['-d', wsdir, 'prune-files'])
            self.assertEqual(result.exit_code, 0)

            # Re-open the workspace to see what the CLI wrote to disk
            after = self.resolver.workspace_from_url(mets_path)
            self.assertEqual(len(after.mets.find_all_files()), 7)

    def test_clone_into_nonexisting_dir(self):
        """
        https://github.com/OCR-D/core/issues/330

        Cloning into a target directory that does not exist yet must succeed.
        """
        with TemporaryDirectory() as tempdir:
            target_dir = join(tempdir, 'non-existing-dir')
            source_mets = assets.path_to('scribo-test/data/mets.xml')
            clone_args = ['clone', '--download', source_mets, target_dir]
            result = self.runner.invoke(workspace_cli, clone_args)
            self.assertEqual(result.exit_code, 0)

    def test_remove_file_group(self):
        """
        Test removal of filegrp

        A plain removal of a non-empty group has to fail and change nothing;
        with ``recursive=True, force=True`` the group and its files vanish.
        """
        file_group = 'OCR-D-GT-PAGE'
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            sample_file = Path(wsdir, file_group, 'FILE_0002_FULLTEXT.xml')
            self.assertTrue(sample_file.exists())

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)

            # Nothing may have changed after the refused removal
            mets = workspace.mets
            self.assertTrue(sample_file.exists())
            self.assertEqual(len(mets.file_groups), 17)
            self.assertEqual(len(mets.find_all_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_all_files()), 33)
            self.assertFalse(sample_file.exists())

            # TODO ensure empty dirs are removed
            # self.assertFalse(sample_file.parent.exists())

    def test_clone_relative(self):
        """Cloning from a relative METS path works (regression test for #319)."""
        absolute_mets = Path(
            assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
        # Use a path relative to the current directory to make sure #319 is gone
        src_path = str(absolute_mets.relative_to(Path.cwd()))
        with TemporaryDirectory() as tempdir:
            clone_args = ['clone', '-a', src_path, tempdir]
            result = self.runner.invoke(workspace_cli, clone_args)
            self.assertEqual(result.exit_code, 0)
            cloned_page = join(tempdir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
            self.assertTrue(exists(cloned_page))

    def test_copy_vs_clone(self):
        """
        Compare a shallow clone and a full (``-a``) clone with a plain copy.

        The shallow clone must lack exactly the three file group
        directories; the full clone must have the same entries as the copy.
        """
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        src_mets = join(src_dir, 'mets.xml')
        with TemporaryDirectory() as tempdir:
            shallow_dir = join(tempdir, 'cloned-shallow')  # cloned without download
            full_dir = join(tempdir, 'cloned-all')         # cloned with download
            copied_dir = join(tempdir, 'copied')           # plain directory copy

            Path(full_dir).mkdir()
            Path(shallow_dir).mkdir()

            shallow_result = self.runner.invoke(
                workspace_cli, ['clone', src_mets, shallow_dir])
            self.assertEqual(shallow_result.exit_code, 0)

            full_result = self.runner.invoke(
                workspace_cli, ['clone', '-a', src_mets, full_dir])
            self.assertEqual(full_result.exit_code, 0)

            with copy_of_directory(src_dir, copied_dir):
                shallow_vs_copied = dircmp(shallow_dir, copied_dir)
                missing_in_shallow = set(shallow_vs_copied.right_only)
                self.assertEqual(
                    missing_in_shallow,
                    {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

                full_vs_copied = dircmp(full_dir, copied_dir)
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE, hence diff_files is not compared.
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])

    def test_find_all_files_multiple_physical_pages_for_fileids(self):
        """``find --page-id`` accepts a comma-separated list of page IDs."""
        sample_data = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(sample_data) as tempdir:
            # The same page given twice is expected to be listed only once
            same_page = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'find',
                '--page-id', 'PHYS_0005,PHYS_0005',
                '-k', 'url',
            ])
            self.assertEqual(same_page.stdout,
                             'OCR-D-IMG/FILE_0005_IMAGE.tif\n')
            self.assertEqual(same_page.exit_code, 0)

            two_pages = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'find',
                '--page-id', 'PHYS_0005,PHYS_0001',
                '-k', 'url',
            ])
            output_lines = two_pages.stdout.split('\n')
            self.assertEqual(len(output_lines), 19)

    def test_mets_basename(self):
        """``-m foo.xml init`` creates ``foo.xml`` and no ``mets.xml``."""
        with TemporaryDirectory() as tempdir, pushd_popd(tempdir):
            init_args = ['-m', 'foo.xml', 'init']
            result = self.runner.invoke(workspace_cli, init_args)
            self.assertEqual(result.exit_code, 0)
            self.assertTrue(exists('foo.xml'))
            self.assertFalse(exists('mets.xml'))

    def test_mets_basename_and_mets(self):
        """Passing both ``-m`` and ``-M`` raises a ``ValueError``."""
        expected_error = "Use either --mets or --mets-basename, not both"
        conflicting_args = ['-m', 'foo.xml', '-M', 'not-foo.xml', 'init']
        with pushd_popd(tempdir=True):
            with self.assertRaisesRegex(ValueError, expected_error):
                self.invoke_cli(workspace_cli, conflicting_args)

    def test_mets_basename_and_not_mets(self):
        """``-M`` still works with ``-d`` but emits a deprecation message."""
        deprecation_notice = \
            '--mets-basename is deprecated. Use --mets/--directory instead'
        with pushd_popd(tempdir=True) as tempdir:
            init_args = ['-d', 'foo', '-M', 'not-foo.xml', 'init']
            _, out, err = self.invoke_cli(workspace_cli, init_args)
            expected_out = join(tempdir, 'foo') + '\n'
            self.assertEqual(out, expected_out)
            self.assertIn(deprecation_notice, err)

    def test_mets_get_id_set_id(self):
        """``get-id`` prints the identifier previously stored with ``set-id``."""
        mets_id = 'foo123'
        with pushd_popd(tempdir=True):
            self.invoke_cli(workspace_cli, ['init'])
            disableLogging()
            self.invoke_cli(workspace_cli, ['set-id', mets_id])
            disableLogging()
            printed = self.invoke_cli(workspace_cli, ['get-id'])[1]
            self.assertEqual(printed, mets_id + '\n')

    def test_mets_directory_incompatible(self):
        """A ``-m`` path outside the ``-d`` directory raises a ``ValueError``."""
        expected_error = \
            "--mets has a directory part inconsistent with --directory"
        inconsistent_args = ['-d', 'foo', '-m', '/somewhere/else', 'init']
        with pushd_popd(tempdir=True):
            with self.assertRaisesRegex(ValueError, expected_error):
                self.invoke_cli(workspace_cli, inconsistent_args)

    def test_mets_directory_html(self):
        """An http(s) ``-m`` URL without ``-d`` raises a ``ValueError``."""
        expected_error = \
            r"--mets is an http\(s\) URL but no --directory was given"
        url_only_args = ['-m', 'https://foo.bar/bla', 'init']
        with pushd_popd(tempdir=True):
            with self.assertRaisesRegex(ValueError, expected_error):
                self.invoke_cli(workspace_cli, url_only_args)

    def test_bulk_add(self):
        """
        ``bulk-add`` registers 100 empty images and 100 empty PAGE files.

        File group, page ID, file ID and URL are all derived from the source
        paths through the named groups of the ``--regex`` pattern.
        """
        file_count = 100
        with TemporaryDirectory() as srcdir:
            # Two source file groups with ``file_count`` empty files each
            for group_name, extension in (("OCR-D-IMG", "tif"),
                                          ("OCR-D-PAGE", "xml")):
                group_dir = Path(srcdir, group_name)
                group_dir.mkdir()
                for number in range(file_count):
                    name = "page_%04d." % number + extension
                    Path(group_dir, name).write_text('')
            with TemporaryDirectory() as wsdir:
                with pushd_popd(wsdir):
                    ws = self.resolver.workspace_from_nothing(directory=wsdir)
                    bulk_args = [
                        'bulk-add', '--ignore',
                        '--regex',
                        r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$',
                        '--url', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}',
                        '--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}',
                        '--page-id', 'PHYS_{{ pageid }}',
                        '--file-grp', '{{ fileGrp }}',
                        '%s/*/*' % srcdir,
                    ]
                    self.invoke_cli(workspace_cli, bulk_args)
                    # Pick up the changes the CLI wrote to the METS on disk
                    ws.reload_mets()
                    mets = ws.mets
                    self.assertEqual(len(mets.file_groups), 2)
                    self.assertEqual(len(mets.find_all_files()),
                                     2 * file_count)
                    tiff_files = mets.find_all_files(mimetype='image/tiff')
                    self.assertEqual(len(tiff_files), file_count)
                    first_ten_images = mets.find_all_files(
                        ID='//FILE_OCR-D-IMG_000.*')
                    self.assertEqual(len(first_ten_images), 10)
                    first_ten_any_group = mets.find_all_files(
                        ID='//FILE_.*_000.*')
                    self.assertEqual(len(first_ten_any_group), 20)
                    files_of_page = mets.find_all_files(pageId='PHYS_0001')
                    self.assertEqual(len(files_of_page), 2)
                    page_file = mets.find_all_files(
                        ID='FILE_OCR-D-PAGE_0001')[0]
                    self.assertEqual(page_file.url,
                                     'OCR-D-PAGE/FILE_0001.xml')
Example #7
0
class TestCli(TestCase):
    def setUp(self):
        """Create the resolver and CLI runner shared by the tests of this class."""
        # unittest: no limit on the length of diffs shown on assertion failure
        self.maxDiff = None
        self.resolver = Resolver()
        initLogging()
        self.runner = CliRunner()

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing

        The file is first added through the Python API and then, in a
        second workspace, through the command line; the CLI call has to
        exit with code 0.
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # Reference run through the API
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp,
                            ID=ID,
                            content=content,
                            pageId=page_id,
                            mimetype=mimetype,
                            local_filename=local_filename)
            ws_api.save_mets()

        # Same operation through the CLI
        with TemporaryDirectory() as tempdir:
            self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            # Invoke the CLI only after the file has been closed, so that the
            # content is flushed to disk before the command reads the file.
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype,
                content_file
            ])
            # TODO comparing the METS written by API and CLI is too complex
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        """``remove --keep-file`` succeeds and leaves the added file on disk."""
        file_id = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as handle:
                handle.write('x')

            init_result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(init_result.exit_code, 0)

            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', file_id,
                '--mimetype', mimetype,
                content_file,
            ]
            add_result = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(add_result.exit_code, 0)

            remove_args = ['-d', tempdir, 'remove', '--keep-file', file_id]
            remove_result = self.runner.invoke(workspace_cli, remove_args)
            self.assertEqual(remove_result.exit_code, 0)

            # The file itself must survive the removal from the workspace
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        """``remove --force`` succeeds and deletes the added file from disk."""
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype, content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--force', ID])
            # Leftover debug prints removed; the CLI output is attached to the
            # assertion message instead, so it only shows up on failure.
            self.assertEqual(result.exit_code, 0, result.output)

            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_find_files(self):
        """``find -G OCR-D-IMG-BIN -k fileGrp`` prints the group once per file."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                find_args = ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp']
                result = self.runner.invoke(workspace_cli, find_args)
                expected_output = 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n'
                self.assertEqual(result.output, expected_output)
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        """``prune-files`` reduces the sample METS from 35 to 7 files."""
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            mets_path = join(tempdir, 'ws', 'mets.xml')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)

            before = self.resolver.workspace_from_url(mets_path)
            self.assertEqual(len(before.mets.find_files()), 35)

            result = self.runner.invoke(workspace_cli,
                                        ['-d', wsdir, 'prune-files'])
            self.assertEqual(result.exit_code, 0)

            # Re-open the workspace to see what the CLI wrote to disk
            after = self.resolver.workspace_from_url(mets_path)
            self.assertEqual(len(after.mets.find_files()), 7)

    def test_clone_into_nonexisting_dir(self):
        """
        https://github.com/OCR-D/core/issues/330

        Cloning into a target directory that does not exist yet must succeed.
        """
        with TemporaryDirectory() as tempdir:
            target_dir = join(tempdir, 'non-existing-dir')
            source_mets = assets.path_to('scribo-test/data/mets.xml')
            clone_args = ['clone', '--download', source_mets, target_dir]
            result = self.runner.invoke(workspace_cli, clone_args)
            self.assertEqual(result.exit_code, 0)

    def test_remove_file_group(self):
        """
        Test removal of filegrp

        A plain removal of a non-empty group has to fail and change nothing;
        with ``recursive=True, force=True`` the group and its files vanish.
        """
        file_group = 'OCR-D-GT-PAGE'
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            sample_file = Path(wsdir, file_group, 'FILE_0002_FULLTEXT.xml')
            self.assertTrue(sample_file.exists())

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)

            # Nothing may have changed after the refused removal
            mets = workspace.mets
            self.assertTrue(sample_file.exists())
            self.assertEqual(len(mets.file_groups), 17)
            self.assertEqual(len(mets.find_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_files()), 33)
            self.assertFalse(sample_file.exists())

            # TODO ensure empty dirs are removed
            # self.assertFalse(sample_file.parent.exists())

    def test_clone_relative(self):
        """Cloning from a relative METS path works (regression test for #319)."""
        absolute_mets = Path(
            assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
        # Use a path relative to the current directory to make sure #319 is gone
        src_path = str(absolute_mets.relative_to(Path.cwd()))
        with TemporaryDirectory() as tempdir:
            clone_args = ['clone', '-a', src_path, tempdir]
            result = self.runner.invoke(workspace_cli, clone_args)
            self.assertEqual(result.exit_code, 0)
            cloned_page = join(tempdir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
            self.assertTrue(exists(cloned_page))

    def test_copy_vs_clone(self):
        """
        Compare a shallow clone and a full (``-a``) clone with a plain copy.

        The shallow clone must lack exactly the three file group
        directories; the full clone must have the same entries as the copy.
        """
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        src_mets = join(src_dir, 'mets.xml')
        with TemporaryDirectory() as tempdir:
            shallow_dir = join(tempdir, 'cloned-shallow')  # cloned without download
            full_dir = join(tempdir, 'cloned-all')         # cloned with download
            copied_dir = join(tempdir, 'copied')           # plain directory copy

            Path(full_dir).mkdir()
            Path(shallow_dir).mkdir()

            shallow_result = self.runner.invoke(
                workspace_cli, ['clone', src_mets, shallow_dir])
            self.assertEqual(shallow_result.exit_code, 0)

            full_result = self.runner.invoke(
                workspace_cli, ['clone', '-a', src_mets, full_dir])
            self.assertEqual(full_result.exit_code, 0)

            with copy_of_directory(src_dir, copied_dir):
                shallow_vs_copied = dircmp(shallow_dir, copied_dir)
                missing_in_shallow = set(shallow_vs_copied.right_only)
                self.assertEqual(
                    missing_in_shallow,
                    {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

                full_vs_copied = dircmp(full_dir, copied_dir)
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE, hence diff_files is not compared.
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])

    def test_mets_basename(self):
        """``-M foo.xml init .`` creates ``foo.xml`` and no ``mets.xml``."""
        with TemporaryDirectory() as tempdir, pushd_popd(tempdir):
            init_args = ['-M', 'foo.xml', 'init', '.']
            result = self.runner.invoke(workspace_cli, init_args)
            self.assertEqual(result.exit_code, 0)
            self.assertTrue(exists('foo.xml'))
            self.assertFalse(exists('mets.xml'))