Esempio n. 1
0
    def workspace_from_folder(self, directory, return_mets=False, clobber_mets=False, convention='ocrd-gt'):
        """
        Create a workspace from an existing folder, creating a METS file.

        Args:
            directory (string) : Existing folder to turn into a workspace.
            return_mets (boolean) : Do not create the actual mets.xml file but return the :class:`OcrdMets`. Default: False.
            clobber_mets (boolean) : Whether to overwrite existing mets.xml. Default: False.
            convention: See add_files_to_mets

        Returns:
            The :class:`OcrdMets` if ``return_mets`` is set, otherwise a
            :class:`Workspace` for ``directory``.

        Raises:
            Exception: If ``directory`` is ``None`` or not an existing
                directory, or if it already contains a mets.xml and
                ``clobber_mets`` is not set.
        """
        if directory is None:
            raise Exception("Must pass directory")
        if not os.path.isdir(directory):
            raise Exception("Directory does not exist or is not a directory: '%s'" % directory)
        # The clobber check applies even when only the OcrdMets is returned.
        if not clobber_mets and os.path.exists(os.path.join(directory, 'mets.xml')):
            raise Exception("Not clobbering existing mets.xml in '%s'." % directory)

        # The isdir() guard above guarantees the folder exists, so it never
        # has to be created here.
        directory = os.path.abspath(directory)

        mets = OcrdMets(content=METS_XML_EMPTY)
        self.add_files_to_mets(convention, mets, directory)
        if return_mets:
            return mets

        mets_fpath = os.path.join(directory, 'mets.xml')
        with open(mets_fpath, 'wb') as fmets:
            log.info("Writing %s", mets_fpath)
            fmets.write(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets)
Esempio n. 2
0
 def test_unique_identifier_from_nothing(self):
     # An empty METS has no identifier until one is assigned, and its
     # serialization carries the ocrd/core version and a creation date.
     import re  # local import: only this test needs it

     mets = OcrdMets.empty_mets()
     self.assertEqual(mets.unique_identifier, None, 'no identifier')
     mets.unique_identifier = 'foo'
     self.assertEqual(mets.unique_identifier, 'foo',
                      'Right identifier after change')
     as_string = mets.to_xml().decode('utf-8')
     self.assertIn('ocrd/core v%s' % VERSION, as_string)
     # Accept any four-digit year instead of pinning the test to 2018.
     self.assertTrue(re.search(r'CREATEDATE="\d{4}-', as_string),
                     'CREATEDATE attribute starting with a year')
Esempio n. 3
0
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Args:
            directory (string) : Folder for the workspace. Created if it does
                not exist; a fresh temporary folder is used if ``None``.
            mets_basename (string) : Basename of the METS file to write. Default: 'mets.xml'.
            clobber_mets (boolean) : Whether to overwrite an existing METS file. Default: False.

        Returns:
            A :class:`Workspace` for ``directory`` backed by the new METS.

        Raises:
            Exception: If the METS file already exists and ``clobber_mets``
                is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        if not os.path.exists(directory):
            os.makedirs(directory)

        mets_fpath = os.path.join(directory, mets_basename)
        if not clobber_mets and os.path.exists(mets_fpath):
            raise Exception("Not clobbering existing mets.xml in '%s'." % directory)
        mets = OcrdMets(content=METS_XML_EMPTY)
        with open(mets_fpath, 'wb') as fmets:
            log.info("Writing %s", mets_fpath)
            fmets.write(mets.to_xml(xmllint=True))

        # Hand the basename on so the workspace reads and saves the file that
        # was just written rather than falling back to 'mets.xml'.
        return Workspace(self, directory, mets, mets_basename=mets_basename)
Esempio n. 4
0
 def __init__(self, resolver, directory, mets=None):
     """
     Set up a workspace rooted at ``directory``.

     Args:
         resolver : Stored as ``self.resolver``.
         directory (string) : Folder of the workspace; its METS file is
             ``mets.xml`` inside this folder.
         mets : METS object to use. Parsed from the METS file if ``None``.
     """
     self.resolver = resolver
     self.directory = directory
     self.mets_filename = os.path.join(directory, 'mets.xml')
     self.mets = OcrdMets(filename=self.mets_filename) if mets is None else mets
     # One cache per image representation.
     self.image_cache = dict((kind, {}) for kind in ('pil', 'cv2', 'exif'))
Esempio n. 5
0
 def setUp(self):
     # Parse the SBB0000F29300010000 sample METS afresh for every test.
     sample_mets_url = assets.url_of('SBB0000F29300010000/mets.xml')
     self.mets = OcrdMets(filename=sample_mets_url)
Esempio n. 6
0
class TestOcrdMets(TestCase):
    """
    Tests for :class:`OcrdMets`, run against the ``SBB0000F29300010000``
    sample METS from the test assets.
    """

    def setUp(self):
        # Every test starts from a freshly parsed copy of the sample METS.
        self.mets = OcrdMets(
            filename=assets.url_of('SBB0000F29300010000/mets.xml'))

    def test_unique_identifier(self):
        # The identifier can be read from the sample and overwritten.
        self.assertEqual(
            self.mets.unique_identifier,
            'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000',
            'Right identifier')
        self.mets.unique_identifier = 'foo'
        self.assertEqual(self.mets.unique_identifier, 'foo',
                         'Right identifier after change')

    def test_unique_identifier_from_nothing(self):
        # An empty METS has no identifier until one is assigned, and its
        # serialization carries the ocrd/core version and a creation date.
        import re  # local import: only this test needs it

        mets = OcrdMets.empty_mets()
        self.assertEqual(mets.unique_identifier, None, 'no identifier')
        mets.unique_identifier = 'foo'
        self.assertEqual(mets.unique_identifier, 'foo',
                         'Right identifier after change')
        as_string = mets.to_xml().decode('utf-8')
        self.assertIn('ocrd/core v%s' % VERSION, as_string)
        # Accept any four-digit year instead of pinning the test to 2018.
        self.assertTrue(re.search(r'CREATEDATE="\d{4}-', as_string),
                        'CREATEDATE attribute starting with a year')

    def test_file_groups(self):
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')

    def test_find_files(self):
        # Filter by file group, GROUPID and mimetype, then without filter.
        self.assertEqual(len(self.mets.find_files(fileGrp='OCR-D-IMG')), 2,
                         '2 files in "OCR-D-IMG"')
        self.assertEqual(len(self.mets.find_files(groupId='FILE_0001_IMAGE')),
                         17, '17 files with GROUPID "FILE_0001_IMAGE"')
        self.assertEqual(len(self.mets.find_files(mimetype='image/tiff')), 12,
                         '12 image/tiff')
        self.assertEqual(len(self.mets.find_files(mimetype=MIMETYPE_PAGE)), 20,
                         '20 ' + MIMETYPE_PAGE)
        self.assertEqual(len(self.mets.find_files()), 34, '34 files total')

    def test_add_group(self):
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')
        self.mets.add_file_group('TEST')
        self.assertEqual(len(self.mets.file_groups), 18, '18 file groups')

    def test_add_file(self):
        # Adding a file to a not-yet-existing group also adds that group.
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')
        self.assertEqual(len(self.mets.find_files(fileGrp='OUTPUT')), 0,
                         '0 files in "OUTPUT"')
        f = self.mets.add_file('OUTPUT', mimetype="bla/quux", groupId="foobar")
        self.assertEqual(f.groupId, 'foobar', 'GROUPID set')
        self.assertEqual(len(self.mets.file_groups), 18, '18 file groups')
        self.assertEqual(len(self.mets.find_files(fileGrp='OUTPUT')), 1,
                         '1 files in "OUTPUT"')

    def test_add_file_no_groupid(self):
        f = self.mets.add_file('OUTPUT', mimetype="bla/quux")
        self.assertEqual(f.groupId, None, 'No GROUPID')

    def test_add_file_ID_fail(self):
        # A duplicate ID is rejected unless force=True, in which case the
        # returned file wraps the same element as the first one.
        f = self.mets.add_file('OUTPUT',
                               ID='best-id-ever',
                               mimetype="beep/boop")
        self.assertEqual(f.ID, 'best-id-ever', "ID kept")
        with self.assertRaises(Exception) as cm:
            self.mets.add_file('OUTPUT',
                               ID='best-id-ever',
                               mimetype="boop/beep")
        self.assertEqual(str(cm.exception),
                         "File with ID='best-id-ever' already exists")
        f2 = self.mets.add_file('OUTPUT',
                                ID='best-id-ever',
                                mimetype="boop/beep",
                                force=True)
        self.assertEqual(f._el, f2._el)

    def test_filegrp_from_file(self):
        f = self.mets.find_files(fileGrp='OCR-D-IMG')[0]
        self.assertEqual(f.fileGrp, 'OCR-D-IMG')

    def test_file_groupid(self):
        # GROUPID of a file can be read and reassigned.
        f = self.mets.find_files()[0]
        self.assertEqual(f.groupId, 'FILE_0001_IMAGE')
        f.groupId = 'foo'
        self.assertEqual(f.groupId, 'foo')
Esempio n. 7
0
 def reload_mets(self):
     """
     Replace the in-memory METS with one parsed again from ``self.mets_target``.
     """
     reloaded = OcrdMets(filename=self.mets_target)
     self.mets = reloaded
Esempio n. 8
0
class Workspace(object):
    """
    A workspace is a temporary directory set up for a processor. It's the
    interface to the METS/PAGE XML and delegates download and upload to the
    Resolver.

    Args:

        resolver : Resolver used to download URLs into the workspace
        directory (string) : Folder to work in
        mets (:class:`OcrdMets`) : OcrdMets representing this workspace. Loaded from the METS file in ``directory`` if ``None``.
        mets_basename (string) : Basename of the METS XML file. Default: 'mets.xml'.
    """
    def __init__(self,
                 resolver,
                 directory,
                 mets=None,
                 mets_basename='mets.xml'):
        self.resolver = resolver
        self.directory = directory
        # Path the METS is loaded from, reloaded from and saved to.
        self.mets_target = os.path.join(directory, mets_basename)
        if mets is None:
            mets = OcrdMets(filename=self.mets_target)
        self.mets = mets
        # Per-URL caches, one per image representation.
        self.image_cache = {
            'pil': {},
            'cv2': {},
            'exif': {},
        }

    def __str__(self):
        return 'Workspace[directory=%s, file_groups=%s, files=%s]' % (
            self.directory,
            self.mets.file_groups,
            [str(f) for f in self.mets.find_files()],
        )

    def reload_mets(self):
        """
        Reload METS from disk.
        """
        self.mets = OcrdMets(filename=self.mets_target)

    def download_url(self, url, **kwargs):
        """
        Download a URL to the workspace.

        Changes the current working directory to the workspace directory.

        Args:
            url (string): URL to download to directory
            **kwargs : See :py:mod:`ocrd.resolver.Resolver`

        Returns:
            The local filename of the downloaded file
        """
        os.chdir(self.directory)
        return self.resolver.download_to_directory(self.directory, url,
                                                   **kwargs)

    def download_file(self, f, **kwargs):
        """
        Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

        Files that already have a ``local_filename`` are not downloaded
        again. In both cases the file's URL is set to the ``file://`` URL of
        its local copy.

        Args:
            f : The file to download
            **kwargs : Passed on to :py:meth:`download_url`

        Returns:
            The same file object, with ``local_filename`` and ``url`` updated
        """
        os.chdir(self.directory)
        if f.local_filename:
            log.debug("Already downloaded: %s", f.local_filename)
        else:
            f.local_filename = self.download_url(f.url, **kwargs)
        f.url = 'file://' + f.local_filename
        return f

    def download_files_in_group(self, file_grp):
        """
        Download all  the :py:mod:`ocrd.model.ocrd_file.OcrdFile` in the file group given.
        """
        for input_file in self.mets.find_files(fileGrp=file_grp):
            self.download_file(input_file, subdir=file_grp)

    def add_file(self,
                 file_grp,
                 basename=None,
                 content=None,
                 local_filename=None,
                 **kwargs):
        """
        Add an output file. Creates an :class:`OcrdFile` to pass around and adds that to the
        OcrdMets OUTPUT section.

        Args:
            file_grp (string) : File group to add the file to
            basename (string) : File name relative to the file group folder
                inside the workspace. Takes precedence over ``local_filename``.
            content : Data to write to the file, if any. ``str`` is encoded
                as UTF-8 on Python 3.
            local_filename (string) : Path of the file, used when no
                ``basename`` is given
            **kwargs : Passed on to ``OcrdMets.add_file``. ``url`` defaults
                to the ``file://`` URL of the local file.

        Returns:
            Whatever ``OcrdMets.add_file`` returns for the new file

        Raises:
            ValueError: If neither ``basename`` nor ``local_filename`` is given
        """
        log.debug(
            'outputfile file_grp=%s basename=%s local_filename=%s content=%s',
            file_grp, basename, local_filename, content is not None)
        if basename is not None:
            if file_grp is not None:
                basename = os.path.join(file_grp, basename)
            local_filename = os.path.join(self.directory, basename)

        if local_filename is None:
            raise ValueError("Must set either basename or local_filename")

        # dirname() yields '' for a bare file name and '/' for a file in the
        # root folder; neither may be passed to makedirs().
        local_filename_dir = os.path.dirname(local_filename)
        if local_filename_dir and not os.path.isdir(local_filename_dir):
            os.makedirs(local_filename_dir)

        if 'url' not in kwargs:
            kwargs['url'] = 'file://' + local_filename

        ret = self.mets.add_file(file_grp,
                                 local_filename=local_filename,
                                 **kwargs)

        if content is not None:
            with open(local_filename, 'wb') as f:
                # The file is opened in binary mode, so text must be encoded.
                if sys.version_info >= (3, 0) and isinstance(content, str):
                    content = bytes(content, 'utf-8')
                f.write(content)

        return ret

    def move_file(self, fobj, dst):
        """
        Move a fobj within the workspace

        Args:
            fobj : File whose ``local_filename`` is moved
            dst (string) : Destination path, relative to the workspace directory
        """
        shutil.move(fobj.local_filename, os.path.join(self.directory, dst))

    def persist(self):
        """
        Persist the workspace using the resolver. Uploads the files in the
        OUTPUT group to the data repository, sets their URL accordingly.

        Only saving the METS is implemented so far; the method always raises
        afterwards.
        """
        self.save_mets()
        raise Exception("NIH")

    def save_mets(self):
        """
        Write out the current state of the METS file.
        """
        with open(self.mets_target, 'wb') as f:
            f.write(self.mets.to_xml(xmllint=True))

    def resolve_image_exif(self, image_url):
        """
        Get the EXIF metadata about an image URL as :class:`OcrdExif`

        Args:
            image_url (string) : URL of image

        Return
            :class:`OcrdExif`
        """
        image_filename = self.download_url(image_url)

        if image_url not in self.image_cache['exif']:
            self.image_cache['exif'][image_url] = OcrdExif.from_filename(
                image_filename)
        return self.image_cache['exif'][image_url]

    def resolve_image_as_pil(self, image_url, coords=None):
        """
        Resolve an image URL to a PIL image.

        Args:
            image_url (string) : URL of image
            coords (list) : Coordinates of the bounding box to cut from the image

        Returns:
            Image or region in image as PIL.Image
        """
        image_filename = self.download_url(image_url)

        if image_url not in self.image_cache['pil']:
            self.image_cache['pil'][image_url] = Image.open(image_filename)

        pil_image = self.image_cache['pil'][image_url]

        if coords is None:
            return pil_image
        else:
            if image_url not in self.image_cache['cv2']:
                self.image_cache['cv2'][image_url] = cv2.cvtColor(
                    np.array(pil_image), cv2.COLOR_RGB2BGR)
            cv2_image = self.image_cache['cv2'][image_url]
            # coords are indexed as (x, y) pairs; the cut is their bounding box.
            poly = np.array(coords, np.int32)
            region_cut = cv2_image[np.min(poly[:, 1]):np.max(poly[:, 1]),
                                   np.min(poly[:, 0]):np.max(poly[:, 0])]
            # NOTE(review): the region is cut from the RGB2BGR-converted
            # array and passed to PIL unchanged - confirm that callers do
            # not depend on the channel order.
            return Image.fromarray(region_cut)
Esempio n. 9
0
class TestOcrdMets(TestCase):
    # Tests for OcrdMets, run against the SBB0000F29300010000 sample METS.

    def setUp(self):
        # Parse the sample METS afresh for every test.
        sample_mets_url = assets.url_of('SBB0000F29300010000/mets.xml')
        self.mets = OcrdMets(filename=sample_mets_url)

    def test_unique_identifier(self):
        expected = 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000'
        self.assertEqual(self.mets.unique_identifier, expected,
                         'Right identifier')

    def test_file_groups(self):
        group_count = len(self.mets.file_groups)
        self.assertEqual(group_count, 17, '17 file groups')

    def test_find_files(self):
        # (filter, expected number of matches, assertion message)
        expectations = (
            ({'fileGrp': 'OCR-D-IMG'}, 2, '2 files in "OCR-D-IMG"'),
            ({'groupId': 'FILE_0001_IMAGE'}, 17,
             '17 files with GROUPID "FILE_0001_IMAGE"'),
            ({'mimetype': 'image/tif'}, 12, '12 image/tif'),
            ({'mimetype': MIMETYPE_PAGE}, 20, '20 ' + MIMETYPE_PAGE),
            ({}, 34, '34 files total'),
        )
        for query, count, message in expectations:
            matches = self.mets.find_files(**query)
            self.assertEqual(len(matches), count, message)

    def test_add_group(self):
        groups_before = len(self.mets.file_groups)
        self.assertEqual(groups_before, 17, '17 file groups')
        self.mets.add_file_group('TEST')
        groups_after = len(self.mets.file_groups)
        self.assertEqual(groups_after, 18, '18 file groups')

    def test_add_file(self):
        # Adding a file to a not-yet-existing group also adds that group.
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')
        output_before = self.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(output_before), 0, '0 files in "OUTPUT"')
        self.mets.add_file('OUTPUT', mimetype="bla/quux")
        self.assertEqual(len(self.mets.file_groups), 18, '18 file groups')
        output_after = self.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(output_after), 1, '1 files in "OUTPUT"')

    def test_file_groupid(self):
        # GROUPID of a file can be read and reassigned.
        first_file = self.mets.find_files()[0]
        self.assertEqual(first_file.groupId, 'FILE_0001_IMAGE')
        first_file.groupId = 'foo'
        self.assertEqual(first_file.groupId, 'foo')
Esempio n. 10
0
#!/usr/bin/env python

from sys import argv
from os.path import isfile
from ocrd.model import OcrdMets

# Rewrites the METS file named on the command line in place: files without
# a page ID get their GROUPID attribute (if any) moved to the page ID.
if len(argv) < 2:
    raise SystemExit("Usage: %s METS_FILE" % argv[0])
fname = argv[1]
if not isfile(fname):
    # Raising a plain string is itself a TypeError, so use a real exception.
    raise IOError("File not found %s" % fname)
mets = OcrdMets(filename=fname)

# pylint: disable=protected-access
for f in mets.find_files():
    if not f.pageId:
        groupid = f._el.get('GROUPID')
        if groupid:
            # The GROUPID value becomes the page ID, so drop the attribute.
            del f._el.attrib['GROUPID']
        else:
            # Nothing to derive a page from: use a placeholder that is easy
            # to find and fix by hand.
            groupid = "FIXME"
            print(
                "!! File %s has neither GROUPID nor mets:fptr in the PHYSICAL structMap"
                % f.url)
        print("Setting page of %s to %s" % (f.ID, groupid))
        f.pageId = groupid

# to_xml() output is written in binary mode, overwriting the input file.
with open(fname, 'wb') as out:
    out.write(mets.to_xml(xmllint=True))