def workspace_from_folder(self, directory, return_mets=False, clobber_mets=False, convention='ocrd-gt'):
    """
    Create a workspace from a folder, creating a METS file.

    Args:
        directory (string) : Existing folder to turn into a workspace.
        return_mets (boolean) : Do not create the actual mets.xml file but return the :class:`OcrdMets`. Default: False.
        clobber_mets (boolean) : Whether to overwrite existing mets.xml. Default: False.
        convention: See add_files_to_mets

    Returns:
        The populated :class:`OcrdMets` if ``return_mets`` is set, otherwise
        a :class:`Workspace` for ``directory`` after writing ``mets.xml``.

    Raises:
        Exception: If ``directory`` is ``None``, is not an existing
            directory, or already holds a ``mets.xml`` while
            ``clobber_mets`` is false.
    """
    if directory is None:
        raise Exception("Must pass directory")
    if not os.path.isdir(directory):
        raise Exception("Directory does not exist or is not a directory: '%s'" % directory)
    if not clobber_mets and os.path.exists(os.path.join(directory, 'mets.xml')):
        raise Exception("Not clobbering existing mets.xml in '%s'." % directory)

    mets = OcrdMets(content=METS_XML_EMPTY)
    # The directory is known to exist at this point (isdir check above),
    # so there is nothing to create; only normalize the path.
    directory = os.path.abspath(directory)
    self.add_files_to_mets(convention, mets, directory)
    if return_mets:
        return mets

    mets_fpath = os.path.join(directory, 'mets.xml')
    with open(mets_fpath, 'wb') as fmets:
        log.info("Writing %s", mets_fpath)
        fmets.write(mets.to_xml(xmllint=True))
    return Workspace(self, directory, mets)
def test_unique_identifier_from_nothing(self):
    # An empty METS has no identifier until one is assigned, and its
    # serialization carries the ocrd/core version and a creation date.
    from datetime import datetime

    mets = OcrdMets.empty_mets()
    self.assertEqual(mets.unique_identifier, None, 'no identifier')
    mets.unique_identifier = 'foo'
    self.assertEqual(mets.unique_identifier, 'foo', 'Right identifier after change')
    as_string = mets.to_xml().decode('utf-8')
    self.assertIn('ocrd/core v%s' % VERSION, as_string)
    # Compare against the current year instead of a hard-coded one, so the
    # test does not start failing when the calendar year changes.
    # NOTE(review): assumes empty_mets() stamps CREATEDATE with the current
    # local date -- confirm against OcrdMets.empty_mets.
    self.assertIn('CREATEDATE="%d-' % datetime.now().year, as_string)
def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
    """
    Create an empty workspace.

    Args:
        directory (string) : Folder to create the workspace in. Created if
            missing; a fresh temporary directory is used if ``None``.
        mets_basename (string) : Basename of the METS file to write.
            Default: 'mets.xml'.
        clobber_mets (boolean) : Whether to overwrite an existing METS file.
            Default: False.

    Returns:
        A :class:`Workspace` for ``directory`` backed by the new, empty METS.

    Raises:
        Exception: If the METS file already exists and ``clobber_mets`` is
            false.
    """
    if directory is None:
        directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
    if not os.path.exists(directory):
        os.makedirs(directory)

    mets_fpath = os.path.join(directory, mets_basename)
    if not clobber_mets and os.path.exists(mets_fpath):
        raise Exception("Not clobbering existing mets.xml in '%s'." % directory)

    mets = OcrdMets(content=METS_XML_EMPTY)
    with open(mets_fpath, 'wb') as fmets:
        log.info("Writing %s", mets_fpath)
        fmets.write(mets.to_xml(xmllint=True))

    # Hand the basename on so the workspace saves/reloads the very file that
    # was just written rather than falling back to 'mets.xml'.
    return Workspace(self, directory, mets, mets_basename=mets_basename)
def __init__(self, resolver, directory, mets=None):
    """
    Set up the workspace state.

    Args:
        resolver : Resolver this workspace delegates to.
        directory (string) : Folder to work in.
        mets (:class:`OcrdMets`) : METS of this workspace. Loaded from
            'mets.xml' inside ``directory`` if ``None``.
    """
    self.resolver = resolver
    self.directory = directory
    self.mets_filename = os.path.join(directory, 'mets.xml')
    # Fall back to the METS file on disk only when none was handed in.
    self.mets = OcrdMets(filename=self.mets_filename) if mets is None else mets
    # One cache per image representation.
    self.image_cache = dict(pil={}, cv2={}, exif={})
def setUp(self):
    # Load the SBB0000F29300010000 sample METS anew for each test.
    fixture_url = assets.url_of('SBB0000F29300010000/mets.xml')
    self.mets = OcrdMets(filename=fixture_url)
class TestOcrdMets(TestCase):
    """
    Tests for :class:`OcrdMets`, run against the SBB0000F29300010000 sample
    METS unless a test builds its own document.
    """

    def setUp(self):
        self.mets = OcrdMets(
            filename=assets.url_of('SBB0000F29300010000/mets.xml'))

    def test_unique_identifier(self):
        self.assertEqual(
            self.mets.unique_identifier,
            'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000',
            'Right identifier')
        self.mets.unique_identifier = 'foo'
        self.assertEqual(self.mets.unique_identifier, 'foo', 'Right identifier after change')

    def test_unique_identifier_from_nothing(self):
        from datetime import datetime

        mets = OcrdMets.empty_mets()
        self.assertEqual(mets.unique_identifier, None, 'no identifier')
        mets.unique_identifier = 'foo'
        self.assertEqual(mets.unique_identifier, 'foo', 'Right identifier after change')
        as_string = mets.to_xml().decode('utf-8')
        self.assertIn('ocrd/core v%s' % VERSION, as_string)
        # Compare against the current year instead of a hard-coded one, so
        # the test does not start failing when the calendar year changes.
        # NOTE(review): assumes empty_mets() stamps CREATEDATE with the
        # current local date -- confirm against OcrdMets.empty_mets.
        self.assertIn('CREATEDATE="%d-' % datetime.now().year, as_string)

    def test_file_groups(self):
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')

    def test_find_files(self):
        self.assertEqual(len(self.mets.find_files(fileGrp='OCR-D-IMG')), 2, '2 files in "OCR-D-IMG"')
        self.assertEqual(len(self.mets.find_files(groupId='FILE_0001_IMAGE')), 17, '17 files with GROUPID "FILE_0001_IMAGE"')
        self.assertEqual(len(self.mets.find_files(mimetype='image/tiff')), 12, '12 image/tiff')
        self.assertEqual(len(self.mets.find_files(mimetype=MIMETYPE_PAGE)), 20, '20 ' + MIMETYPE_PAGE)
        self.assertEqual(len(self.mets.find_files()), 34, '34 files total')

    def test_add_group(self):
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')
        self.mets.add_file_group('TEST')
        self.assertEqual(len(self.mets.file_groups), 18, '18 file groups')

    def test_add_file(self):
        self.assertEqual(len(self.mets.file_groups), 17, '17 file groups')
        self.assertEqual(len(self.mets.find_files(fileGrp='OUTPUT')), 0, '0 files in "OUTPUT"')
        f = self.mets.add_file('OUTPUT', mimetype="bla/quux", groupId="foobar")
        self.assertEqual(f.groupId, 'foobar', 'GROUPID set')
        self.assertEqual(len(self.mets.file_groups), 18, '18 file groups')
        self.assertEqual(len(self.mets.find_files(fileGrp='OUTPUT')), 1, '1 files in "OUTPUT"')

    def test_add_file_no_groupid(self):
        f = self.mets.add_file('OUTPUT', mimetype="bla/quux")
        self.assertEqual(f.groupId, None, 'No GROUPID')

    def test_add_file_ID_fail(self):
        f = self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
        self.assertEqual(f.ID, 'best-id-ever', "ID kept")
        # Re-using an ID must be rejected ...
        with self.assertRaises(Exception) as cm:
            self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep")
        self.assertEqual(str(cm.exception), "File with ID='best-id-ever' already exists")
        # ... unless forced, in which case the same element is handed back.
        f2 = self.mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True)
        self.assertEqual(f._el, f2._el)

    def test_filegrp_from_file(self):
        f = self.mets.find_files(fileGrp='OCR-D-IMG')[0]
        self.assertEqual(f.fileGrp, 'OCR-D-IMG')

    def test_file_groupid(self):
        f = self.mets.find_files()[0]
        self.assertEqual(f.groupId, 'FILE_0001_IMAGE')
        f.groupId = 'foo'
        self.assertEqual(f.groupId, 'foo')
def reload_mets(self):
    """
    Replace ``self.mets`` with a fresh :class:`OcrdMets` read from
    ``self.mets_target``.
    """
    fresh_mets = OcrdMets(filename=self.mets_target)
    self.mets = fresh_mets
class Workspace(object):
    """
    A workspace is a temporary directory set up for a processor. It's the
    interface to the METS/PAGE XML and delegates download and upload to the
    Resolver.

    Args:
        resolver : Resolver that downloads are delegated to (its
            ``download_to_directory`` method is called).
        directory (string) : Folder to work in
        mets (:class:`OcrdMets`) : OcrdMets representing this workspace.
            Loaded from ``mets_basename`` inside ``directory`` if ``None``.
        mets_basename (string) : Basename of the METS XML file.
            Default: 'mets.xml'.
    """

    def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml'):
        self.resolver = resolver
        self.directory = directory
        self.mets_target = os.path.join(directory, mets_basename)
        if mets is None:
            mets = OcrdMets(filename=self.mets_target)
        self.mets = mets
        # Per-URL caches, one per image representation.
        self.image_cache = {
            'pil': {},
            'cv2': {},
            'exif': {},
        }

    def __str__(self):
        return 'Workspace[directory=%s, file_groups=%s, files=%s]' % (
            self.directory,
            self.mets.file_groups,
            [str(f) for f in self.mets.find_files()],
        )

    def reload_mets(self):
        """
        Reload METS from disk.
        """
        self.mets = OcrdMets(filename=self.mets_target)

    def download_url(self, url, **kwargs):
        """
        Download a URL to the workspace.

        Note that this changes the current working directory of the process
        to the workspace directory.

        Args:
            url (string): URL to download to directory
            **kwargs : See :py:mod:`ocrd.resolver.Resolver`

        Returns:
            The local filename of the downloaded file
        """
        os.chdir(self.directory)
        return self.resolver.download_to_directory(self.directory, url, **kwargs)

    def download_file(self, f, **kwargs):
        """
        Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

        Files that already have a ``local_filename`` are left untouched;
        otherwise the file is downloaded and its ``local_filename`` and
        ``url`` are pointed at the local copy.

        Returns:
            The same file object ``f``.
        """
        os.chdir(self.directory)
        if f.local_filename:
            log.debug("Already downloaded: %s", f.local_filename)
        else:
            f.local_filename = self.download_url(f.url, **kwargs)
            f.url = 'file://' + f.local_filename
        return f

    def download_files_in_group(self, file_grp):
        """
        Download all the :py:mod:`ocrd.model.ocrd_file.OcrdFile` in the file group given.
        """
        for input_file in self.mets.find_files(fileGrp=file_grp):
            self.download_file(input_file, subdir=file_grp)

    def add_file(self, file_grp, basename=None, content=None, local_filename=None, **kwargs):
        """
        Add an output file. Creates an :class:`OcrdFile` to pass around and adds that to the
        OcrdMets OUTPUT section.

        Args:
            file_grp (string) : File group to add the file to.
            basename (string) : If given, the local filename is derived as
                ``<directory>/<file_grp>/<basename>`` (without the group
                folder if ``file_grp`` is ``None``), overriding
                ``local_filename``.
            content : If given, written to the local file; text is encoded
                as UTF-8 on Python 3.
            local_filename (string) : Local path of the file; used when
                ``basename`` is not given.
            **kwargs : Passed on to ``OcrdMets.add_file``. ``url`` defaults
                to a ``file://`` URL of the local filename.

        Returns:
            Whatever ``OcrdMets.add_file`` returns.

        Raises:
            Exception: If neither ``basename`` nor ``local_filename`` is given.
        """
        log.debug(
            'outputfile file_grp=%s basename=%s local_filename=%s content=%s',
            file_grp,
            basename,
            local_filename,
            content is not None)
        if basename is not None:
            if file_grp is not None:
                basename = os.path.join(file_grp, basename)
            local_filename = os.path.join(self.directory, basename)

        if local_filename is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise Exception("Must pass either 'basename' or 'local_filename'")

        # os.path.dirname instead of splitting on '/': for a filename without
        # any separator the split returned the filename itself, so a
        # directory was created in place of the file.
        local_filename_dir = os.path.dirname(local_filename)
        if local_filename_dir and not os.path.isdir(local_filename_dir):
            os.makedirs(local_filename_dir)

        if 'url' not in kwargs:
            kwargs['url'] = 'file://' + local_filename

        ret = self.mets.add_file(file_grp, local_filename=local_filename, **kwargs)

        if content is not None:
            with open(local_filename, 'wb') as f:
                # The file is opened in binary mode, so text needs encoding
                # on Python 3.
                if sys.version_info >= (3, 0) and isinstance(content, str):
                    content = bytes(content, 'utf-8')
                f.write(content)

        return ret

    def move_file(self, fobj, dst):
        """
        Move a fobj within the workspace

        Args:
            fobj : File object whose ``local_filename`` is moved.
            dst (string) : Destination path, relative to the workspace directory.
        """
        shutil.move(fobj.local_filename, os.path.join(self.directory, dst))

    def persist(self):
        """
        Persist the workspace using the resolver. Uploads the files in the
        OUTPUT group to the data repository, sets their URL accordingly.

        Not implemented yet: saves the METS, then always raises.
        """
        self.save_mets()
        raise Exception("NIH")

    def save_mets(self):
        """
        Write out the current state of the METS file.
        """
        with open(self.mets_target, 'wb') as f:
            f.write(self.mets.to_xml(xmllint=True))

    def resolve_image_exif(self, image_url):
        """
        Get the EXIF metadata about an image URL as :class:`OcrdExif`

        Args:
            image_url (string) : URL of image

        Return
            :class:`OcrdExif`
        """
        image_filename = self.download_url(image_url)

        if image_url not in self.image_cache['exif']:
            self.image_cache['exif'][image_url] = OcrdExif.from_filename(
                image_filename)
        return self.image_cache['exif'][image_url]

    def resolve_image_as_pil(self, image_url, coords=None):
        """
        Resolve an image URL to a PIL image.

        Args:
            image_url (string) : URL of image
            coords (list) : Coordinates of the bounding box to cut from the image

        Returns:
            Image or region in image as PIL.Image
        """
        image_filename = self.download_url(image_url)

        if image_url not in self.image_cache['pil']:
            self.image_cache['pil'][image_url] = Image.open(image_filename)
        pil_image = self.image_cache['pil'][image_url]

        if coords is None:
            return pil_image

        if image_url not in self.image_cache['cv2']:
            self.image_cache['cv2'][image_url] = cv2.cvtColor(
                np.array(pil_image), cv2.COLOR_RGB2BGR)
        cv2_image = self.image_cache['cv2'][image_url]

        # Crop to the axis-aligned bounding box of the given points
        # (x in column 0, y in column 1).
        poly = np.array(coords, np.int32)
        region_cut = cv2_image[
            np.min(poly[:, 1]):np.max(poly[:, 1]),
            np.min(poly[:, 0]):np.max(poly[:, 0])
        ]
        # The cached OpenCV array is in BGR channel order while PIL expects
        # RGB, so reverse the channel axis before handing the crop to PIL;
        # otherwise red and blue are swapped in the result.
        return Image.fromarray(np.ascontiguousarray(region_cut[:, :, ::-1]))
class TestOcrdMets(TestCase):
    """
    Tests for :class:`OcrdMets` against the SBB0000F29300010000 sample METS.
    """

    def setUp(self):
        # Every test gets its own freshly loaded METS.
        fixture_url = assets.url_of('SBB0000F29300010000/mets.xml')
        self.mets = OcrdMets(filename=fixture_url)

    def test_unique_identifier(self):
        identifier = self.mets.unique_identifier
        expected = 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000'
        self.assertEqual(identifier, expected, 'Right identifier')

    def test_file_groups(self):
        group_count = len(self.mets.file_groups)
        self.assertEqual(group_count, 17, '17 file groups')

    def test_find_files(self):
        # Filter by file group, GROUPID and mimetype, then without filter.
        in_img_group = self.mets.find_files(fileGrp='OCR-D-IMG')
        self.assertEqual(len(in_img_group), 2, '2 files in "OCR-D-IMG"')
        with_group_id = self.mets.find_files(groupId='FILE_0001_IMAGE')
        self.assertEqual(len(with_group_id), 17, '17 files with GROUPID "FILE_0001_IMAGE"')
        tif_files = self.mets.find_files(mimetype='image/tif')
        self.assertEqual(len(tif_files), 12, '12 image/tif')
        page_files = self.mets.find_files(mimetype=MIMETYPE_PAGE)
        self.assertEqual(len(page_files), 20, '20 ' + MIMETYPE_PAGE)
        all_files = self.mets.find_files()
        self.assertEqual(len(all_files), 34, '34 files total')

    def test_add_group(self):
        count_before = len(self.mets.file_groups)
        self.assertEqual(count_before, 17, '17 file groups')
        self.mets.add_file_group('TEST')
        count_after = len(self.mets.file_groups)
        self.assertEqual(count_after, 18, '18 file groups')

    def test_add_file(self):
        # Adding a file to an unknown group creates that group implicitly.
        groups_before = len(self.mets.file_groups)
        self.assertEqual(groups_before, 17, '17 file groups')
        output_before = self.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(output_before), 0, '0 files in "OUTPUT"')
        self.mets.add_file('OUTPUT', mimetype="bla/quux")
        groups_after = len(self.mets.file_groups)
        self.assertEqual(groups_after, 18, '18 file groups')
        output_after = self.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(output_after), 1, '1 files in "OUTPUT"')

    def test_file_groupid(self):
        first_file = self.mets.find_files()[0]
        self.assertEqual(first_file.groupId, 'FILE_0001_IMAGE')
        first_file.groupId = 'foo'
        self.assertEqual(first_file.groupId, 'foo')
#!/usr/bin/env python
"""
Rewrite a METS file in place so that files without a page ID get one.

For every file that has no ``pageId``, the value of its ``GROUPID``
attribute is used as the page ID and the attribute is removed. Files with
neither are reported and assigned the placeholder page ID "FIXME".

Usage: pass the path of the METS file as the only argument.
"""
from sys import argv
from os.path import isfile
from ocrd.model import OcrdMets

if len(argv) < 2:
    raise SystemExit("Usage: %s METS_FILE" % argv[0])

fname = argv[1]
if not isfile(fname):
    # Raising a plain string is itself a TypeError; raise a real exception.
    raise IOError("File not found %s" % fname)
mets = OcrdMets(filename=fname)

# pylint: disable=protected-access
for f in mets.find_files():
    if not f.pageId:
        groupid = f._el.get('GROUPID')
        if groupid:
            # The GROUPID attribute is superseded by the page ID set below.
            del f._el.attrib['GROUPID']
        else:
            groupid = "FIXME"
            print(
                "!! File %s has neither GROUPID nor mets:fptr in the PHYSICAL structMap"
                % f.url)
        print("Setting page of %s to %s" % (f.ID, groupid))
        f.pageId = groupid

with open(fname, 'wb') as out:
    out.write(mets.to_xml(xmllint=True))