Exemple #1
0
def process_cli(mets_url, **kwargs):
    """
    Run a sequence of OCR-D processors on the workspace of a METS file.

    ``kwargs['ocrd_tool']`` lists ocrd-tool JSON files; the ``binary`` of
    every entry under their ``tools`` key counts as a registered command.
    ``kwargs['steps']`` lists the commands to run, in order. All steps are
    checked against the registered commands before the first one is run.

    Raises:
        Exception: if a step is not among the registered commands.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets_url)

    # Collect the binaries declared by all ocrd-tool files
    registered = []
    for tool_json_path in kwargs['ocrd_tool']:
        with codecs.open(tool_json_path, encoding='utf-8') as handle:
            tool_spec = json.loads(handle.read())
        registered.extend(entry['binary'] for entry in tool_spec['tools'])

    # Validate everything first so nothing runs if one step is unknown
    steps = kwargs['steps']
    for step in steps:
        if step not in registered:
            raise Exception("Tool not registered: '%s'" % step)

    for step in steps:
        run_cli(step, mets_url, resolver, workspace)

    workspace.reload_mets()
    print(workspace)
 def _sample_ws_for_overwrite(self):
     """
     Yield a workspace created from nothing in a temporary directory.

     The workspace contains one file in ``IN-GRP`` and three files in
     ``OUT-GRP``; the METS is saved before the workspace is yielded.
     """
     resolver = Resolver()
     # (file group, running number) for every file to register
     layout = (('IN-GRP', 1), ('OUT-GRP', 2), ('OUT-GRP', 3), ('OUT-GRP', 4))
     with TemporaryDirectory() as tempdir:
         ws = resolver.workspace_from_nothing(directory=tempdir)
         for file_grp, num in layout:
             ws.add_file(file_grp,
                         pageId='pID%d' % num,
                         ID='fID%d' % num,
                         mimetype='image/tiff',
                         content='CONTENT',
                         local_filename=join(tempdir, 'ID%d.tif' % num))
         ws.save_mets()
         yield ws
Exemple #3
0
 def test_bulk_add_stdin(self):
     """
     Feed four file descriptions to ``bulk-add`` through a mocked stdin and
     check that they end up in a single file group of the METS.
     """
     resolver = Resolver()
     with pushd_popd(tempdir=True) as wsdir:
         ws = resolver.workspace_from_nothing(directory=wsdir)
         Path(wsdir, 'BIN').mkdir()
         for relpath in ('BIN/FILE_0001_BIN.IMG-wolf.png',
                         'BIN/FILE_0002_BIN.IMG-wolf.png',
                         'BIN/FILE_0001_BIN.xml',
                         'BIN/FILE_0002_BIN.xml'):
             Path(wsdir, relpath).write_text('')
         # One line per file: pageid, filegrp, fileid, src, dest, mimetype
         stdin_payload = (
             'PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png BIN/FILE_0001_BIN.IMG-wolf.png image/png\n'
             'PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png BIN/FILE_0002_BIN.IMG-wolf.png image/png\n'
             'PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml BIN/FILE_0001_BIN.xml application/vnd.prima.page+xml\n'
             'PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml BIN/FILE_0002_BIN.xml application/vnd.prima.page+xml\n'
         )
         cli_args = [
             'bulk-add',
             '-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<src>.*) (?P<dest>.*) (?P<mimetype>.*)',
             '-G', '{{ filegrp }}',
             '-g', '{{ pageid }}',
             '-i', '{{ fileid }}',
             '-m', '{{ mimetype }}',
             '-u', "{{ dest }}",
             '-',
         ]
         with mock_stdin(stdin_payload):
             assert len(ws.mets.file_groups) == 0
             _exit_code, _out, _err = self.invoke_cli(workspace_cli, cli_args)
             ws.reload_mets()
             assert len(ws.mets.file_groups) == 1
             assert len(list(ws.mets.find_files())) == 4
             first_file = next(ws.mets.find_files())
             assert first_file.mimetype == 'image/png'
             assert first_file.ID == 'FILE_0001_BIN.IMG-wolf'
             assert first_file.url == 'BIN/FILE_0001_BIN.IMG-wolf.png'
Exemple #4
0
 def editable(self, editable: bool) -> None:
     """
     Switch the document between an editable and a read-only workspace.

     When ``editable`` is true the workspace becomes a clone of
     ``self._original_url`` if that is set, otherwise an empty workspace
     created from nothing. When false, the workspace is re-opened from
     ``self.baseurl_mets``.
     """
     if not editable:
         workspace = Resolver().workspace_from_url(self.baseurl_mets)
     elif self._original_url:
         workspace = self._clone_workspace(self._original_url)
     else:
         workspace = Resolver().workspace_from_nothing(
             directory=None, mets_basename='mets.xml')
     self.workspace = workspace
     self._editable = editable
Exemple #5
0
def kant_ocrdzip(ocrd_identifier):
    """
    Bag the ``kant_aufklaerung_1784`` asset workspace and yield the zip path.

    The bag is written to a millisecond-stamped file in the system temp
    directory using a strict ``WorkspaceBagger``.

    Args:
        ocrd_identifier: identifier passed on to ``WorkspaceBagger.bag``.

    Yields:
        Path of the created ``.ocrd.zip`` file. The file is deleted when the
        generator is resumed, closed, or has an exception thrown into it.
    """
    resolver = Resolver()
    bagger = WorkspaceBagger(resolver, strict=True)
    dest = join(gettempdir(),
                'olahd-test-bag-%d.ocrd.zip' % int(round((time() * 1000))))
    ws = resolver.workspace_from_url(
        assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
    bagger.bag(ws, ocrd_identifier, dest=dest)
    try:
        yield dest
    finally:
        # Clean up even if the consumer fails or abandons the generator,
        # otherwise the zip would be left behind in the temp directory.
        unlink(dest)
Exemple #6
0
 def test_binarize_lines(self):
     """
     Run ``KrakenBinarize`` with ``level-of-operation`` set to ``line`` on
     the kant_aufklaerung_1784 workspace and save the resulting METS.
     """
     resolver = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     binarizer = KrakenBinarize(
         workspace,
         input_file_grp="OCR-D-GT-PAGE",
         output_file_grp="OCR-D-IMG-BIN-KRAKEN",
         parameter={'level-of-operation': 'line'},
     )
     binarizer.process()
     workspace.save_mets()
Exemple #7
0
 def __init__(self, directory, mets_url, mets_basename, automatic_backup):
     """
     Store the CLI context for the ``ocrd workspace`` commands.

     ``directory``, ``mets_url`` and ``mets_basename`` are normalised via
     ``Resolver.resolve_mets_arguments``; passing a ``mets_basename`` logs
     a deprecation warning first.
     """
     self.log = getLogger('ocrd.cli.workspace')
     self.resolver = Resolver()
     if mets_basename:
         deprecation = DeprecationWarning(
             '--mets-basename is deprecated. Use --mets/--directory instead.')
         self.log.warning(deprecation)
     resolved = self.resolver.resolve_mets_arguments(
         directory, mets_url, mets_basename)
     self.directory, self.mets_url, self.mets_basename = resolved
     self.automatic_backup = automatic_backup
Exemple #8
0
 def runTest(self):
     """
     Chain region, line and word segmentation on the small Herold
     workspace, each step reading the previous step's output file group.
     """
     resolver = Resolver()
     workspace = resolver.workspace_from_url(METS_HEROLD_SMALL,
                                             dst_dir=WORKSPACE_DIR)
     # (processor class, input file group, output file group)
     pipeline = (
         (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
         (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
         (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
     )
     for processor_class, source_grp, target_grp in pipeline:
         processor_class(workspace,
                         input_file_grp=source_grp,
                         output_file_grp=target_grp).process()
     workspace.save_mets()
Exemple #9
0
 def test_copies_ok(self):
     """
     Run ``DummyProcessor`` twice (OCR-D-IMG -> OUTPUT -> OUTPUT2) on a
     copy of the SBB0000F29300010000 workspace and check what it registers.
     """
     with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(Resolver(), wsdir)

         def find(**criteria):
             # Always go through workspace.mets so a replaced METS is picked up
             return workspace.mets.find_files(**criteria)

         self.assertEqual(len(find(fileGrp='OCR-D-IMG')), 3)
         self.assertEqual(len(find(fileGrp='OUTPUT')), 0)

         # First pass: OCR-D-IMG -> OUTPUT
         run_processor(
             DummyProcessor,
             input_file_grp='OCR-D-IMG',
             output_file_grp='OUTPUT',
             workspace=workspace
         )
         first_pass = find(fileGrp='OUTPUT')
         first_pass.sort(key=lambda x: x.url)
         print([str(s) for s in first_pass])
         image_file, page_file = first_pass[0], first_pass[1]
         self.assertEqual(image_file.url, 'OUTPUT/OUTPUT_0001.tif')
         self.assertEqual(page_file.url, 'OUTPUT/OUTPUT_0001.xml')
         self.assertEqual(page_from_file(page_file).pcGtsId, page_file.ID)
         self.assertEqual(page_from_file(page_file).get_Page().imageFilename, image_file.url)
         self.assertEqual(len(first_pass), 6)
         self.assertEqual(len(find(ID='//OUTPUT.*')), 6)
         self.assertEqual(len(find(ID='//OUTPUT.*_PAGE')), 3)
         self.assertEqual(len(find(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)

         # Second pass: OUTPUT -> OUTPUT2
         run_processor(
             DummyProcessor,
             input_file_grp='OUTPUT',
             output_file_grp='OUTPUT2',
             workspace=workspace
         )
         second_pass = find(fileGrp='OUTPUT2')
         second_pass.sort(key=lambda x: x.url)
         self.assertEqual(len(second_pass), 3)
Exemple #10
0
 def __init__(self, directory, mets_url, mets_basename, automatic_backup):
     """
     Resolve the workspace directory and METS location from the CLI options.

     Args:
         directory: value of ``--directory`` (may be empty).
         mets_url: value of ``--mets`` (may be empty).
         mets_basename: value of the deprecated ``--mets-basename``;
             defaults to ``mets.xml`` when empty.
         automatic_backup: stored unchanged on the instance.

     Raises:
         ValueError: if both ``--mets`` and ``--mets-basename`` are given,
             if ``--mets`` lies outside ``--directory``, or if ``--mets`` is
             an http(s) URL and no ``--directory`` was given.
     """
     self.log = getLogger('ocrd.cli.workspace')
     if mets_basename and mets_url:
         raise ValueError("Use either --mets or --mets-basename, not both")
     if mets_basename:
         # mets_url is necessarily unset here (see the check above)
         self.log.warning(DeprecationWarning("--mets-basename is deprecated. Use --mets/--directory instead"))
     else:
         mets_basename = 'mets.xml'
     if directory and mets_url:
         directory = abspath(directory)
         mets_abspath = abspath(mets_url)
         # Compare against the directory *with* a trailing separator: a bare
         # prefix test would accept "/ws-old/mets.xml" for "--directory /ws".
         if mets_abspath != directory and not mets_abspath.startswith(join(directory, '')):
             raise ValueError("--mets has a directory part inconsistent with --directory")
     elif mets_url:
         # Match the URL scheme only, so that local paths which merely start
         # with the letters "http" are still accepted.
         if mets_url.startswith(('http://', 'https://')):
             raise ValueError("--mets is an http(s) URL but no --directory was given")
         directory = dirname(abspath(mets_url)) or getcwd()
     elif directory:
         directory = abspath(directory)
         mets_url = join(directory, mets_basename)
     else:
         directory = getcwd()
         mets_url = join(directory, mets_basename)
     self.directory = directory
     self.resolver = Resolver()
     self.mets_url = mets_url
     self.automatic_backup = automatic_backup
Exemple #11
0
def validate_process(tasks, workspace):
    '''
    Validate a sequence of tasks as they would be passed to 'ocrd process'.

    With a workspace directory the parsed tasks are validated together
    against that workspace; without one, each task is validated on its own.
    '''
    parsed_tasks = [ProcessorTask.parse(task) for task in tasks]
    if not workspace:
        for parsed in parsed_tasks:
            _inform_of_result(parsed.validate())
        return
    target = Workspace(Resolver(), directory=workspace)
    _inform_of_result(validate_tasks(parsed_tasks, target))
Exemple #12
0
 def process(self):
     """
     Bag the current workspace into an OCRD-ZIP in the system temp
     directory and post it through an ``OlaHdClient`` after logging in.

     Endpoint and credentials are read from ``self.parameter``.
     """
     params = self.parameter
     client = OlaHdClient(params['endpoint'],
                          params['username'],
                          params['password'])
     bagger = WorkspaceBagger(Resolver(), strict=True)
     # TODO
     millis = int(round((time() * 1000)))
     zip_path = join(gettempdir(), 'bag-%d.ocrd.zip' % millis)
     # TODO
     identifier = self.workspace.mets.unique_identifier
     bagger.bag(self.workspace, identifier, dest=zip_path)
     client.login()
     client.post(zip_path, prev_pid=identifier)
Exemple #13
0
 def _clone_workspace(cls, mets_url: Union[Path, str]) -> Workspace:
     """
     Clone a workspace (mets.xml and all used files) into a new temporary
     directory so that it can be edited.

     The directory is appended to ``cls.temporary_workspaces``.
     """
     logger = getLogger(
         'ocrd_browser.model.document.Document._clone_workspace')
     source_url = cls._strip_local(mets_url, disallow_remote=False)
     clone_dir = mkdtemp(prefix='browse-ocrd-clone-')
     cls.temporary_workspaces.append(clone_dir)
     # TODO download = False and lazy loading would be nice for responsiveness
     logger.info("Cloning '%s' to '%s'", source_url, clone_dir)
     return Resolver().workspace_from_url(mets_url=source_url,
                                          dst_dir=clone_dir,
                                          download=True)
Exemple #14
0
    def save_as(self,
                mets_url: Union[Path, str],
                backup_directory: Union[bool, Path, str] = True) -> None:
        """
        Save the document's workspace under a new METS location.

        The parent directory of ``mets_url`` becomes the workspace directory.
        If it already exists it is moved to ``backup_directory`` (derived via
        ``_derive_backup_directory`` when a bool is passed), or deleted when
        ``backup_directory`` is falsy. The workspace is then re-created there
        and every file is downloaded, emitting ``document_saving`` progress
        events and a final ``document_saved`` event.
        """
        log = getLogger('ocrd_browser.model.document.Document.save_as')
        target_mets = Path(self._strip_local(mets_url, disallow_remote=True))
        target_dir = target_mets.parent

        # Get any existing directory out of the way first
        if target_dir.exists():
            if not backup_directory:
                shutil.rmtree(str(target_dir))
            else:
                if isinstance(backup_directory, bool):
                    backup_directory = self._derive_backup_directory(
                        target_dir)
                shutil.move(str(target_dir), str(backup_directory))

        target_dir.mkdir(parents=True, exist_ok=True)
        self._emit('document_saving', 0, None)

        saved_space = Resolver().workspace_from_url(
            mets_url=self.workspace.mets_target,
            mets_basename=target_mets.name,
            download=False,
            clobber_mets=True,
            dst_dir=target_dir)
        saved_files = list(saved_space.mets.find_files())
        total = len(saved_files)
        for index, ocrd_file in enumerate(saved_files):
            downloaded = saved_space.download_file(ocrd_file)
            self._emit('document_saving', index / total, downloaded)

        self._emit('document_saving', 1, None)
        self._emit('document_saved', Document(saved_space, self.emitter))
        self._original_url = str(target_mets)
        self._modified = False
        log.info('Saved to %s', self._original_url)
Exemple #15
0
    def load(cls,
             mets_url: Union[Path, str] = None,
             emitter: EventCallBack = None) -> 'Document':
        """
        Open a project from a URL as a read-only view.

        Without a ``mets_url`` a new document is created via ``cls.create``.
        To modify the workspace, use Document.clone instead.
        """
        if mets_url:
            local_url = cls._strip_local(mets_url)
            workspace = Resolver().workspace_from_url(local_url, download=False)
            document = cls(workspace, emitter=emitter, original_url=local_url)
            document._empty = False
            return document
        return cls.create(emitter=emitter)
Exemple #16
0
    def testProcessorProfiling(self):
        """
        Capture the ``ocrd.process.profile`` logger while running
        ``DummyProcessor`` and check that a timing message was logged.
        """
        initLogging()
        capture = FIFOIO(256)
        handler = logging.StreamHandler(capture)
        handler.setFormatter(logging.Formatter(LOG_FORMAT))
        getLogger('ocrd.process.profile').setLevel('DEBUG')
        getLogger('ocrd.process.profile').addHandler(handler)

        run_processor(
            DummyProcessor,
            resolver=Resolver(),
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
        )

        logged = capture.getvalue()
        capture.close()
        # Check whether profile information has been logged. Dummy should finish in under 0.1s
        profile_pattern = r'.*Executing processor \'ocrd-test\' took 0.\d+s.*'
        self.assertTrue(match(profile_pattern, logged))
Exemple #17
0
class TestXsdValidator(TestCase):
    """Tests for ``XsdValidator`` and ``XsdMetsValidator`` on METS documents."""

    def setUp(self):
        self.resolver = Resolver()
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.ws = self.resolver.workspace_from_url(mets_url)

    def test_constructor(self):
        # An unknown schema is rejected, the METS schema URL is accepted
        with self.assertRaisesRegex(Exception, 'schema not bundled'):
            XsdValidator('foo')
        XsdValidator(XSD_METS_URL)

    def test_mets_empty(self):
        expected_errors = (
            "Line 3: Element '{http://www.loc.gov/METS/}metsHdr', attribute 'CREATEDATE': '{{ NOW }}' is not a valid value of the atomic type 'xs:dateTime'.",
            "Line 18: Element '{http://www.loc.gov/METS/}fileSec': Missing child element(s). Expected is ( {http://www.loc.gov/METS/}fileGrp ).",
        )
        with TemporaryDirectory() as tempdir:
            empty_mets = Path(tempdir, 'mets.xml')
            empty_mets.write_bytes(METS_XML_EMPTY)
            report = XsdMetsValidator.validate(empty_mets)
            self.assertEqual(len(report.errors), 2)
            self.assertEqual(report.errors[0], expected_errors[0])
            self.assertEqual(report.errors[1], expected_errors[1])
            self.assertFalse(report.is_valid)

    def test_validate_simple_protected_str(self):
        validator = XsdValidator(XSD_METS_URL)
        serialized = self.ws.mets.to_xml()
        self.assertTrue(validator._validate(serialized).is_valid)

    def test_validate_simple_protected_doc(self):
        validator = XsdValidator(XSD_METS_URL)
        tree = self.ws.mets._tree
        self.assertTrue(validator._validate(tree).is_valid)

    def test_validate_simple_static_doc(self):
        tree = self.ws.mets._tree
        self.assertTrue(XsdValidator.validate(XSD_METS_URL, tree).is_valid)
Exemple #18
0
 def setUp(self):
     """Prepare resolver, logging and a click ``CliRunner`` for each test."""
     # Show full diffs when an assertion on long strings fails
     self.maxDiff = None
     self.resolver = Resolver()
     initLogging()
     self.runner = CliRunner()
 def setUp(self):
     """Set the model path (relative to the current directory) and a resolver."""
     self.model_path = Path.cwd() / 'models/latest_net_G.pth'
     self.resolver = Resolver()
Exemple #20
0
class TestCli(TestCase):
    def setUp(self):
        """Disable logging and create the resolver and CLI runner for each test."""
        super().setUp()
        disableLogging()
        # Show full diffs when an assertion on long strings fails
        self.maxDiff = None
        self.resolver = Resolver()
        # Keep stderr separate from stdout in the invocation results
        self.runner = CliRunner(mix_stderr=False)

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing

        The same file is added once through the Python API and once through
        the CLI; only the CLI exit code is asserted.
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # Reference run through the Python API
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp,
                            ID=ID,
                            content=content,
                            pageId=page_id,
                            mimetype=mimetype,
                            local_filename=local_filename)
            ws_api.save_mets()

        # Same operation through the CLI
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            # Invoke the CLI only after the file has been closed, so that the
            # content is flushed to disk before `add` looks at the file.
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype,
                content_file
            ])
            # TODO too complex to compare the METS written by API and CLI :(
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        """
        Add a file through the CLI and remove it again with ``--keep-file``;
        the file on disk must still exist afterwards.
        """
        file_id = 'foo123file'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as handle:
                handle.write('x')

            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', 'TEST_GROUP',
                '--page-id', 'foo123page',
                '--file-id', file_id,
                '--mimetype', 'image/tiff',
                content_file,
            ]
            remove_args = ['-d', tempdir, 'remove', '--keep-file', file_id]

            outcome = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(outcome.exit_code, 0)

            outcome = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(outcome.exit_code, 0)

            outcome = self.runner.invoke(workspace_cli, remove_args)
            self.assertEqual(outcome.exit_code, 0)

            # File should still exist
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        """
        Add a file through the CLI and remove it again with ``--force``;
        the file on disk must be gone afterwards.
        """
        file_id = 'foo123file'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as handle:
                handle.write('x')

            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', 'TEST_GROUP',
                '--page-id', 'foo123page',
                '--file-id', file_id,
                '--mimetype', 'image/tiff',
                content_file,
            ]
            remove_args = ['-d', tempdir, 'remove', '--force', file_id]

            outcome = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(outcome.exit_code, 0)

            outcome = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(outcome.exit_code, 0)

            outcome = self.runner.invoke(workspace_cli, remove_args)
            self.assertEqual(outcome.exit_code, 0)

            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_add_url(self):
        """
        Adding a remote URL through the CLI stores that URL unchanged as the
        file's ``url`` in the METS.
        """
        remote_url = 'http://remote/file.tif'
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            add_args = [
                '-d', tempdir, 'add',
                '--file-grp', 'TEST_GROUP',
                '--page-id', 'foo123page',
                '--file-id', 'foo123file',
                '--mimetype', 'image/tiff',
                remote_url,
            ]
            outcome = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(outcome.exit_code, 0)
            ws.reload_mets()
            added = ws.mets.find_all_files()[0]
            self.assertEqual(added.url, remote_url)

    def test_add_nonexisting_checked(self):
        """
        ``add -C`` on a path that does not exist exits with code 1 and
        reports the missing file on stderr.
        """
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            add_args = [
                '-d', tempdir, 'add', '-C',
                '--file-grp', 'TEST_GROUP',
                '--page-id', 'foo123page',
                '--file-id', 'foo123file',
                '--mimetype', 'image/tiff',
                'does-not-exist.xml',
            ]
            exit_code, _out, err = self.invoke_cli(workspace_cli, add_args)
            self.assertEqual(exit_code, 1)
            expected_message = "File 'does-not-exist.xml' does not exist, halt execution!"
            self.assertIn(expected_message, err)

    def test_add_519(self):
        """
        https://github.com/OCR-D/core/issues/519

        Adding a file that lives outside the workspace must place a copy
        with identical content in the file group directory.
        """
        payload = 'foo'
        with TemporaryDirectory() as tempdir:
            wsdir = Path(tempdir, "workspace")
            srcdir = Path(tempdir, "source")
            wsdir.mkdir()
            srcdir.mkdir()
            srcfile = Path(srcdir, "srcfile.jpg")
            srcfile.write_text(payload)
            add_args = [
                'add',
                '-m', 'image/jpg',
                '-G', 'MAX',
                '-i', 'IMG_MAX_1818975',
                '-C',
                str(srcfile),
            ]
            with pushd_popd(str(wsdir)):
                exit_code, out, err = self.invoke_cli(workspace_cli, ['init'])
                exit_code, out, err = self.invoke_cli(workspace_cli, add_args)
                self.assertEqual(exit_code, 0)
                copied = Path(wsdir, 'MAX', 'srcfile.jpg')
                self.assertTrue(copied.exists())
                self.assertEqual(
                    Path(wsdir, 'MAX', 'srcfile.jpg').read_text(), payload)

    def test_add_existing_checked(self):
        """
        ``add -C`` on an existing file inside the workspace succeeds and
        records the workspace-relative path as the file's ``url``.
        """
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'test.tif')
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            with open(content_file, 'w') as handle:
                handle.write('x')
            add_args = [
                '-d', tempdir, 'add', '-C',
                '--file-grp', 'TEST_GROUP',
                '--page-id', 'foo123page',
                '--file-id', 'foo123file',
                '--mimetype', 'image/tiff',
                content_file,
            ]
            outcome = self.runner.invoke(workspace_cli, add_args)
            self.assertEqual(outcome.exit_code, 0)
            ws.reload_mets()
            added = ws.mets.find_all_files()[0]
            self.assertEqual(added.url, 'test.tif')

    def test_find_all_files(self):
        """
        ``find -G OCR-D-IMG-BIN -k fileGrp`` run inside a copy of the
        SBB0000F29300010000 workspace prints the group name twice.
        """
        find_args = ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp']
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                outcome = self.runner.invoke(workspace_cli, find_args)
                self.assertEqual(outcome.output,
                                 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
                self.assertEqual(outcome.exit_code, 0)

    def test_prune_files(self):
        """
        ``prune-files`` on a copy of the SBB0000F29300010000 workspace
        reduces the number of files in the METS from 35 to 7.
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            mets_path = join(tempdir, 'ws', 'mets.xml')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)

            before = self.resolver.workspace_from_url(mets_path)
            self.assertEqual(len(before.mets.find_all_files()), 35)

            outcome = self.runner.invoke(
                workspace_cli, ['-d', wsdir, 'prune-files'])
            self.assertEqual(outcome.exit_code, 0)

            after = self.resolver.workspace_from_url(mets_path)
            self.assertEqual(len(after.mets.find_all_files()), 7)

    def test_clone_into_nonexisting_dir(self):
        """
        https://github.com/OCR-D/core/issues/330

        ``clone --download`` into a directory that does not exist yet must
        exit with code 0.
        """
        with TemporaryDirectory() as tempdir:
            target_dir = join(tempdir, 'non-existing-dir')
            source_mets = assets.path_to('scribo-test/data/mets.xml')
            outcome = self.runner.invoke(
                workspace_cli, ['clone', '--download', source_mets, target_dir])
            self.assertEqual(outcome.exit_code, 0)

    def test_remove_file_group(self):
        """
        Test removal of filegrp: a plain removal of a non-empty group fails,
        a recursive forced removal drops the group and its files.
        """
        file_group = 'OCR-D-GT-PAGE'
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            sample_file = Path(tempdir, 'ws', file_group,
                               'FILE_0002_FULLTEXT.xml')
            self.assertTrue(sample_file.exists())

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            # Without recursive/force, a non-empty group must be refused
            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)

            # ... and nothing may have changed
            self.assertTrue(sample_file.exists())
            self.assertEqual(len(workspace.mets.file_groups), 17)
            self.assertEqual(len(workspace.mets.find_all_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_all_files()), 33)
            self.assertFalse(sample_file.exists())

            # TODO ensure empty dirs are removed
            # self.assertFalse(sample_file.parent.exists())

    def test_clone_relative(self):
        """
        ``clone -a`` with a METS path relative to the current directory
        works and fetches the referenced files (regression test for #319).
        """
        # Create a relative path to make sure #319 is gone
        absolute_mets = Path(assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
        src_path = str(absolute_mets.relative_to(Path.cwd()))
        with TemporaryDirectory() as tempdir:
            outcome = self.runner.invoke(workspace_cli,
                                         ['clone', '-a', src_path, tempdir])
            self.assertEqual(outcome.exit_code, 0)
            cloned_page = join(tempdir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
            self.assertTrue(exists(cloned_page))

    def test_copy_vs_clone(self):
        """
        Compare a shallow clone, a full clone (``-a``) and a plain directory
        copy of the kant_aufklaerung_1784 workspace.

        The shallow clone lacks the file group directories; the full clone
        has the same directory entries as the copy.
        """
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        src_mets = join(src_dir, 'mets.xml')
        with TemporaryDirectory() as tempdir:
            shallow_dir = join(tempdir, 'cloned-shallow')  # cloned without download
            full_dir = join(tempdir, 'cloned-all')  # cloned with download
            copied_dir = join(tempdir, 'copied')  # copied

            Path(full_dir).mkdir()
            Path(shallow_dir).mkdir()

            outcome = self.runner.invoke(
                workspace_cli, ['clone', src_mets, shallow_dir])
            self.assertEqual(outcome.exit_code, 0)

            outcome = self.runner.invoke(
                workspace_cli, ['clone', '-a', src_mets, full_dir])
            self.assertEqual(outcome.exit_code, 0)

            with copy_of_directory(src_dir, copied_dir):
                shallow_vs_copied = dircmp(shallow_dir, copied_dir)
                self.assertEqual(
                    set(shallow_vs_copied.right_only),
                    {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

                full_vs_copied = dircmp(full_dir, copied_dir)
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE, so diff_files is not compared here.
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])

    def test_find_all_files_multiple_physical_pages_for_fileids(self):
        """
        ``find --page-id`` with a comma-separated list: a repeated page ID
        yields a single URL, two distinct pages yield 19 output chunks.
        """
        source = assets.path_to('SBB0000F29300010000/data')
        with copy_of_directory(source) as tempdir:

            def find_urls(page_ids):
                # Invoke `find` for the given --page-id value, printing URLs
                return self.runner.invoke(workspace_cli, [
                    '-d', tempdir, 'find', '--page-id', page_ids, '-k', 'url'
                ])

            outcome = find_urls('PHYS_0005,PHYS_0005')
            self.assertEqual(outcome.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n')
            self.assertEqual(outcome.exit_code, 0)

            outcome = find_urls('PHYS_0005,PHYS_0001')
            self.assertEqual(len(outcome.stdout.split('\n')), 19)

    def test_mets_basename(self):
        """
        ``-m foo.xml init`` creates ``foo.xml`` and no ``mets.xml`` in the
        current directory.
        """
        with TemporaryDirectory() as tempdir, pushd_popd(tempdir):
            outcome = self.runner.invoke(workspace_cli,
                                         ['-m', 'foo.xml', 'init'])
            self.assertEqual(outcome.exit_code, 0)
            self.assertTrue(exists('foo.xml'))
            self.assertFalse(exists('mets.xml'))

    def test_mets_basename_and_mets(self):
        """Passing both ``-m`` and ``-M`` must raise a ``ValueError``."""
        expected_message = "Use either --mets or --mets-basename, not both"
        cli_args = ['-m', 'foo.xml', '-M', 'not-foo.xml', 'init']
        with pushd_popd(tempdir=True):
            with self.assertRaisesRegex(ValueError, expected_message):
                self.invoke_cli(workspace_cli, cli_args)

    def test_mets_basename_and_not_mets(self):
        """
        ``-M`` without ``-m`` still works: stdout is the workspace directory
        and stderr carries the deprecation message.
        """
        cli_args = ['-d', 'foo', '-M', 'not-foo.xml', 'init']
        with pushd_popd(tempdir=True) as tempdir:
            _, out, err = self.invoke_cli(workspace_cli, cli_args)
            expected_stdout = join(tempdir, 'foo') + '\n'
            self.assertEqual(out, expected_stdout)
            self.assertIn(
                '--mets-basename is deprecated. Use --mets/--directory instead',
                err)

    def test_mets_get_id_set_id(self):
        """``get-id`` prints the identifier previously stored by ``set-id``."""
        mets_id = 'foo123'
        with pushd_popd(tempdir=True):
            self.invoke_cli(workspace_cli, ['init'])
            disableLogging()
            self.invoke_cli(workspace_cli, ['set-id', mets_id])
            disableLogging()
            _, printed, _ = self.invoke_cli(workspace_cli, ['get-id'])
            self.assertEqual(printed, mets_id + '\n')

    def test_mets_directory_incompatible(self):
        """A ``-m`` path outside of the ``-d`` directory must raise a ``ValueError``."""
        expected_message = "--mets has a directory part inconsistent with --directory"
        cli_args = ['-d', 'foo', '-m', '/somewhere/else', 'init']
        with pushd_popd(tempdir=True):
            with self.assertRaisesRegex(ValueError, expected_message):
                self.invoke_cli(workspace_cli, cli_args)

    def test_mets_directory_html(self):
        """An https ``-m`` URL without ``-d`` must raise a ``ValueError``."""
        expected_pattern = r"--mets is an http\(s\) URL but no --directory was given"
        cli_args = ['-m', 'https://foo.bar/bla', 'init']
        with pushd_popd(tempdir=True):
            with self.assertRaisesRegex(ValueError, expected_pattern):
                self.invoke_cli(workspace_cli, cli_args)

    def test_bulk_add(self):
        """
        ``bulk-add`` with a glob over 100 image and 100 PAGE files registers
        all of them, deriving file group, IDs and URLs from the regex groups.
        """
        NO_FILES = 100
        with TemporaryDirectory() as srcdir:
            Path(srcdir, "OCR-D-IMG").mkdir()
            Path(srcdir, "OCR-D-PAGE").mkdir()
            for subdir, name_template in (("OCR-D-IMG", "page_%04d.tif"),
                                          ("OCR-D-PAGE", "page_%04d.xml")):
                for i in range(NO_FILES):
                    Path(srcdir, subdir, name_template % i).write_text('')
            bulk_args = [
                'bulk-add', '--ignore',
                '--regex', r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$',
                '--url', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}',
                '--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}',
                '--page-id', 'PHYS_{{ pageid }}',
                '--file-grp', '{{ fileGrp }}',
                '%s/*/*' % srcdir,
            ]
            with TemporaryDirectory() as wsdir, pushd_popd(wsdir):
                ws = self.resolver.workspace_from_nothing(directory=wsdir)
                exit_code, out, err = self.invoke_cli(workspace_cli, bulk_args)
                ws.reload_mets()

                def count(**criteria):
                    # Number of METS files matching the given criteria
                    return len(ws.mets.find_all_files(**criteria))

                self.assertEqual(len(ws.mets.file_groups), 2)
                self.assertEqual(count(), 2 * NO_FILES)
                self.assertEqual(count(mimetype='image/tiff'), NO_FILES)
                self.assertEqual(count(ID='//FILE_OCR-D-IMG_000.*'), 10)
                self.assertEqual(count(ID='//FILE_.*_000.*'), 20)
                self.assertEqual(count(pageId='PHYS_0001'), 2)
                page_file = ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0]
                self.assertEqual(page_file.url, 'OCR-D-PAGE/FILE_0001.xml')
 def setUp(self):
     """Create a fresh ``Resolver`` for each test."""
     self.resolver = Resolver()
Exemple #22
0
 def __init__(self, directory, mets_basename, automatic_backup):
     """Store the given options unchanged and create a ``Resolver``."""
     self.directory = directory
     self.resolver = Resolver()
     self.mets_basename = mets_basename
     self.automatic_backup = automatic_backup
def resolver():
    """
    Return a newly constructed `Resolver`.
    """
    new_resolver = Resolver()
    return new_resolver
Exemple #24
0
# TODO: PAGE-XML

# Maps a MIME type to the file extension (including the leading dot) used
# for it.
MIME_TO_EXT = {
    MIMETYPE_PAGE: ".xml",
    "application/pdf": ".pdf",
    "image/tiff": ".tif",
    "image/tif": ".tif",
    "image/jp2": ".jp2",
    "image/png": ".png",
    "image/jpg": ".jpg",
    "image/jpeg": ".jpg",
    "application/alto+xml": ".xml",
}

# Module-level resolver instance.
resolver = Resolver()

# Directory containing this file (the path must exist, hence strict
# resolution) and the `update-bagit` script located directly inside it.
DOCS_REPO = Path(__file__).resolve(strict=True).parent
UPDATE_BAGIT_SCRIPT = DOCS_REPO / 'update-bagit'


def update_checksums(bagdir):
    """
    Run the `update-bagit` zsh script with `bagdir` as working directory.

    The exit status of the script is not inspected.
    """
    command = 'zsh "%s"' % UPDATE_BAGIT_SCRIPT
    with pushd_popd(bagdir):
        os.system(command)


# NOTE(review): `resolver` is already assigned a `Resolver()` further up in
# this module; this second assignment replaces it with a new instance and
# looks redundant -- confirm before removing.
resolver = Resolver()


def do_the_update(bagdir, non_local_urls=False):
    # NOTE(review): only the first statement of this function is visible in
    # this excerpt; `non_local_urls` is not used in the visible part.
    # `directory` is the `data` subdirectory of `bagdir`.
    directory = Path(bagdir, 'data')
Exemple #25
0
class TestCli(TestCase):
    """
    Tests for the `ocrd workspace` command line interface.

    Most tests drive `workspace_cli` through `self.runner` against a
    temporary directory; `test_remove_file_group` exercises the workspace
    API directly.
    """

    def setUp(self):
        """
        Prepare a resolver, logging and a CLI runner for every test.
        """
        self.maxDiff = None
        self.resolver = Resolver()
        initLogging()
        self.runner = CliRunner()

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # Add the file through the API ...
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp,
                            ID=ID,
                            content=content,
                            pageId=page_id,
                            mimetype=mimetype,
                            local_filename=local_filename)
            ws_api.save_mets()

        # ... and through the CLI.
        with TemporaryDirectory() as tempdir:
            self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            # Invoke the CLI only after the file has been closed, so that
            # the buffered content is actually on disk when it is read.
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype,
                content_file
            ])
            # TODO compare the METS written via the API with the METS
            # written via the CLI (too complex to compare for now)
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        """
        Ensure that `remove --keep-file` leaves the file on disk
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype, content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--keep-file', ID])
            self.assertEqual(result.exit_code, 0)

            # File should still exist
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        """
        Ensure that `remove --force` also deletes the file from disk
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir, 'add', '--file-grp', file_grp, '--page-id',
                page_id, '--file-id', ID, '--mimetype', mimetype, content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--force', ID])
            # Show the CLI output only when the command failed, instead of
            # printing it unconditionally.
            self.assertEqual(result.exit_code, 0, result.output)

            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_find_files(self):
        """
        Ensure that `find -G OCR-D-IMG-BIN -k fileGrp` outputs two
        `OCR-D-IMG-BIN` lines for the SBB0000F29300010000 sample
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                result = self.runner.invoke(
                    workspace_cli,
                    ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp'])
                self.assertEqual(result.output,
                                 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        """
        Ensure that `prune-files` reduces the files in the sample METS
        from 35 to 7
        """
        with TemporaryDirectory() as tempdir:
            copytree(assets.path_to('SBB0000F29300010000/data'),
                     join(tempdir, 'ws'))

            ws1 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws1.mets.find_files()), 35)

            result = self.runner.invoke(
                workspace_cli, ['-d', join(tempdir, 'ws'), 'prune-files'])
            self.assertEqual(result.exit_code, 0)

            ws2 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws2.mets.find_files()), 7)

    def test_clone_into_nonexisting_dir(self):
        """
        Ensure that `clone --download` succeeds for a target directory
        that does not exist yet

        https://github.com/OCR-D/core/issues/330
        """
        with TemporaryDirectory() as tempdir:
            clone_to = join(tempdir, 'non-existing-dir')
            result = self.runner.invoke(workspace_cli, [
                'clone', '--download',
                assets.path_to('scribo-test/data/mets.xml'), clone_to
            ])
            self.assertEqual(result.exit_code, 0)

    def test_remove_file_group(self):
        """
        Test removal of filegrp
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            file_group = 'OCR-D-GT-PAGE'
            file_path = Path(tempdir, 'ws', file_group,
                             'FILE_0002_FULLTEXT.xml')
            self.assertTrue(file_path.exists())

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            # Without recursive/force, removing a non-empty group must fail
            # and leave both the METS and the file untouched.
            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)

            self.assertTrue(file_path.exists())
            self.assertEqual(len(workspace.mets.file_groups), 17)
            self.assertEqual(len(workspace.mets.find_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_files()), 33)
            self.assertFalse(file_path.exists())

            # TODO ensure empty dirs are removed
            # self.assertFalse(file_path.parent.exists())

    def test_clone_relative(self):
        """
        Ensure that `clone -a` works with a relative path to the METS
        """
        # Use a relative source path to make sure #319 is gone
        mets_path = Path(assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
        src_path = str(mets_path.relative_to(Path.cwd()))
        with TemporaryDirectory() as tempdir:
            result = self.runner.invoke(workspace_cli,
                                        ['clone', '-a', src_path, tempdir])
            self.assertEqual(result.exit_code, 0)
            self.assertTrue(
                exists(join(tempdir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')))

    def test_copy_vs_clone(self):
        """
        Compare a shallow clone and a full (`-a`) clone with a plain copy
        of the source directory
        """
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        with TemporaryDirectory() as tempdir:
            # cloned without download
            shallowcloneddir = join(tempdir, 'cloned-shallow')
            # cloned with download
            fullcloneddir = join(tempdir, 'cloned-all')
            # copied
            copieddir = join(tempdir, 'copied')

            Path(fullcloneddir).mkdir()
            Path(shallowcloneddir).mkdir()

            result = self.runner.invoke(
                workspace_cli,
                ['clone', join(src_dir, 'mets.xml'), shallowcloneddir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli,
                ['clone', '-a',
                 join(src_dir, 'mets.xml'), fullcloneddir])
            self.assertEqual(result.exit_code, 0)

            with copy_of_directory(src_dir, copieddir):
                # The shallow clone lacks exactly these file group
                # directories compared to the copy.
                shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
                self.assertEqual(
                    set(shallow_vs_copied.right_only),
                    {'OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG'})

                full_vs_copied = dircmp(fullcloneddir, copieddir)
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE, so `diff_files` is not compared.
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])

    def test_mets_basename(self):
        """
        Ensure that `-M foo.xml init .` creates `foo.xml` and no `mets.xml`
        """
        with TemporaryDirectory() as tempdir:
            with pushd_popd(tempdir):
                result = self.runner.invoke(workspace_cli,
                                            ['-M', 'foo.xml', 'init', '.'])
                self.assertEqual(result.exit_code, 0)
                self.assertTrue(exists('foo.xml'))
                self.assertFalse(exists('mets.xml'))
 def setUp(self):
     """
     Run the parent `setUp`, then create a `Resolver` and open the
     SBB0000F29300010000 sample workspace as `self.ws`.
     """
     super().setUp()
     self.resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.ws = self.resolver.workspace_from_url(mets_url)
Exemple #27
0
 def __init__(self, directory):
     """
     Remember `directory` and create a `Resolver` with caching enabled.

     `config` starts out as an empty dict and `verbose` as False.
     """
     # Defaults first, then the caller-supplied value and the resolver.
     self.config = {}
     self.verbose = False
     self.directory = directory
     self.resolver = Resolver(cache_enabled=True)
Exemple #28
0
 def __init__(self, directory, mets_basename):
     """
     Remember `directory` and `mets_basename` and create a `Resolver`.

     `config` starts out as an empty dict and `verbose` as False.
     """
     # Defaults first, then the caller-supplied values and the resolver.
     self.config = {}
     self.verbose = False
     self.directory = directory
     self.mets_basename = mets_basename
     self.resolver = Resolver()
Exemple #29
0
 def setUp(self):
     """
     Run the parent `setUp`, disable logging, and create the resolver
     and a CLI runner that keeps stderr separate from stdout.
     """
     super().setUp()
     disableLogging()
     new_resolver = Resolver()
     cli_runner = CliRunner(mix_stderr=False)
     self.maxDiff = None
     self.resolver = new_resolver
     self.runner = cli_runner
Exemple #30
0
 def setUp(self):
     self.resolver = Resolver()
     initLogging()