def test_encoding_entities():
    """Non-ASCII characters in an agent name must survive XML serialization."""
    source_xml = """ <mets> <metsHdr> <agent> <name>Őh śéé Áŕ</name> <note>OCR-D</note> </agent> </metsHdr> </mets> """
    document = OcrdMets(content=source_xml)
    serialized = document.to_xml().decode('utf-8')
    # the name must come back verbatim, not as numeric character entities
    assert 'Őh śéé Áŕ' in serialized
def test_add_file_ignore(sbb_sample_01: OcrdMets):
    """Re-adding an existing ID with ``ignore=True`` must not insert a second file.

    With the ignore flag set, responsibility for overwriting existing files
    is delegated to the user.
    """
    file_id = 'best-id-ever'
    first = sbb_sample_01.add_file('OUTPUT', ID=file_id, mimetype="beep/boop")
    assert first.ID == file_id
    second = sbb_sample_01.add_file('OUTPUT', ID=file_id, mimetype="boop/beep", ignore=True)
    assert second.ID == file_id
    # exactly one entry with that ID may exist afterwards
    assert len(list(sbb_sample_01.find_files(ID=file_id))) == 1
def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
    """
    Set up a fresh workspace that contains nothing but an empty METS file.

    Arguments:
        directory (string): Target directory for the workspace. If ``None``,
            a temporary directory is created via ``mkdtemp`` with the prefix
            ``TMP_PREFIX``. Missing parent directories are created.
    Keyword Arguments:
        mets_basename (string, ``mets.xml``): File name of the METS file
            inside ``directory``.
        clobber_mets (boolean, False): Whether an already existing METS file
            may be overwritten.
    Returns:
        a new ``Workspace`` bound to this resolver, ``directory`` and the
        freshly created METS.
    Raises:
        FileExistsError: if the METS file already exists and ``clobber_mets``
            is not set.
    """
    log = getLogger('ocrd.resolver.workspace_from_nothing')
    if directory is None:
        directory = mkdtemp(prefix=TMP_PREFIX)
    workspace_dir = Path(directory)
    workspace_dir.mkdir(parents=True, exist_ok=True)
    mets_path = workspace_dir / mets_basename
    already_there = mets_path.exists()
    if already_there and not clobber_mets:
        message = "METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory)
        raise FileExistsError(message)
    empty = OcrdMets.empty_mets()
    log.info("Writing METS to %s", mets_path)
    mets_path.write_bytes(empty.to_xml(xmllint=True))
    return Workspace(self, directory, empty, mets_basename=mets_basename)
def test_add_group():
    """Adding the same fileGrp twice must leave exactly one group."""
    doc = OcrdMets.empty_mets()
    assert len(doc.file_groups) == 0, '0 file groups'
    # the second pass re-adds the existing group; the count must stay at one
    for _ in range(2):
        doc.add_file_group('TEST')
        assert len(doc.file_groups) == 1, '1 file groups'
def test_add_group(self):
    """Adding the same fileGrp twice must leave exactly one group."""
    doc = OcrdMets.empty_mets()
    self.assertEqual(len(doc.file_groups), 0, '0 file groups')
    # the second pass re-adds the existing group; the count must stay at one
    for _ in range(2):
        doc.add_file_group('TEST')
        self.assertEqual(len(doc.file_groups), 1, '1 file groups')
def test_add_file(self):
    """Added files show up in their fileGrp and can be moved between physical pages."""
    doc = OcrdMets.empty_mets()
    self.assertEqual(len(doc.file_groups), 0, '0 file groups')
    self.assertEqual(len(doc.find_files(fileGrp='OUTPUT')), 0, '0 files in "OUTPUT"')
    first = doc.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar")
    second = doc.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
    self.assertEqual(first.pageId, 'foobar', 'pageId set')
    self.assertEqual(len(doc.file_groups), 1, '1 file groups')
    self.assertEqual(len(doc.find_files(fileGrp='OUTPUT')), 2, '2 files in "OUTPUT"')
    # (target page, file, order, orderlabel) -- applied strictly in this sequence
    reassignments = (
        ('barfoo', first, '300', "page 300"),
        ('quux', second, '302', "page 302"),
        ('barfoo', second, '301', "page 301"),
    )
    for page_id, ocrd_file, order, label in reassignments:
        doc.set_physical_page_for_file(page_id, ocrd_file, order=order, orderlabel=label)
        self.assertEqual(ocrd_file.pageId, page_id, 'pageId changed')
    self.assertEqual(len(doc.file_groups), 1, '1 file group')
def test_unique_identifier_from_nothing():
    """A fresh METS has no unique identifier until one is assigned.

    Also checks that the serialized document mentions the ocrd/core version
    and a ``CREATEDATE`` starting with the date handed to ``empty_mets``.
    """
    # Take ONE timestamp and reuse it everywhere: calling datetime.now()
    # again for the expected value would make the test fail whenever it
    # happens to run across midnight.
    now = datetime.now()
    mets = OcrdMets.empty_mets(now.isoformat())
    assert mets.unique_identifier is None, 'no identifier'
    mets.unique_identifier = 'foo'
    assert mets.unique_identifier == 'foo', 'Right identifier after change is "foo"'
    as_string = mets.to_xml().decode('utf-8')
    assert 'ocrd/core v%s' % VERSION in as_string
    assert 'CREATEDATE="%04u-%02u-%02uT' % (now.year, now.month, now.day) in as_string
def test_page_from_file_no_existe(self):
    """``page_from_file`` is expected to raise ``FileNotFoundError`` for a missing local file."""
    expected_message = "File not found: 'no-existe'"
    with self.assertRaisesRegex(FileNotFoundError, expected_message):
        empty = OcrdMets.empty_mets()
        missing = empty.add_file('FOO', ID='foo', local_filename='no-existe', mimetype='foo/bar')
        page_from_file(missing)
def test_fptr_changed_for_change_id():
    """Changing a file's ID must carry its physical page assignment over to the new ID."""
    doc = OcrdMets.empty_mets()
    tiff = doc.add_file('FOO', ID='FOO_1', mimetype='image/tiff', pageId='p0001')

    def pages_of(file_id):
        # physical pages looked up for a single file ID
        return doc.get_physical_pages(for_fileIds=[file_id])

    assert pages_of('FOO_1') == ['p0001']
    tiff.ID = 'BAZ_1'
    # the old ID no longer resolves, the new one takes over the page
    assert pages_of('FOO_1') == [None]
    assert pages_of('BAZ_1') == ['p0001']
def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None):
    """
    Store the workspace settings and attach a METS document.

    Arguments:
        resolver: stored as ``self.resolver``.
        directory: workspace directory; stored as ``self.directory``.
        mets: METS object to use. If ``None``, an ``OcrdMets`` is loaded
            from ``mets_basename`` inside ``directory``.
        mets_basename: file name of the METS file within ``directory``.
        automatic_backup: stored as ``self.automatic_backup``.
        baseurl: stored as ``self.baseurl``.
    """
    self.resolver = resolver
    self.directory = directory
    self.mets_target = str(Path(directory) / mets_basename)
    self.overwrite_mode = False
    # only read the METS from disk when the caller did not hand one in
    self.mets = OcrdMets(filename=self.mets_target) if mets is None else mets
    self.automatic_backup = automatic_backup
    self.baseurl = baseurl
def test_ocrd_file_equality():
    """Files with different IDs differ; same ID with equivalent mimetypes compares equal."""
    doc = OcrdMets.empty_mets()
    foo_1 = doc.add_file('FOO', ID='FOO_1', mimetype='image/tiff')
    foo_2 = doc.add_file('FOO', ID='FOO_2', mimetype='image/tiff')
    assert foo_1 != foo_2
    tiff_file = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tiff')
    tif_file = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tif')
    # be tolerant of different equivalent mimetypes
    assert tiff_file == tif_file
    in_mets = doc.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff')
    assert tiff_file == in_mets
def test_unique_identifier_from_nothing(self):
    """A fresh METS has no unique identifier until one is assigned.

    Also checks that the serialized document mentions the ocrd/core version
    and a ``CREATEDATE`` starting with the date handed to ``empty_mets``.
    """
    # Take ONE timestamp and reuse it everywhere: calling datetime.now()
    # again for the expected value would make the test fail whenever it
    # happens to run across midnight.
    now = datetime.now()
    mets = OcrdMets.empty_mets(now.isoformat())
    self.assertIsNone(mets.unique_identifier, 'no identifier')
    mets.unique_identifier = 'foo'
    self.assertEqual(mets.unique_identifier, 'foo', 'Right identifier after change')
    as_string = mets.to_xml().decode('utf-8')
    self.assertIn('ocrd/core v%s' % VERSION, as_string)
    self.assertIn('CREATEDATE="%04u-%02u-%02uT' % (now.year, now.month, now.day), as_string)
def read_from_mets(metsfile, filegrp, page_ids, outputfile, pagelabel='pageId', overwrite=False):
    """
    Merge the single-page PDFs of one fileGrp and register the result in the METS.

    Arguments:
        metsfile: path of the METS file to read and, on success, rewrite.
        filegrp: fileGrp searched for ``application/pdf`` files; the merged
            file is added to the same group.
        page_ids: page ID selection passed to ``find_files``; an empty value
            is passed on as ``None``.
        outputfile: used as the ID of the merged file and, with ``.pdf``
            appended, as its file name below ``filegrp``.
        pagelabel: attribute of each input file used as its page label;
            with ``"pagenumber"`` no labels are collected.
        overwrite: ``True`` or the string ``'true'`` to pass ``force=True``
            to ``add_file``; anything else means ``False``.
    Returns:
        None
    """
    # The default is a bool, but the original check only recognised the
    # string 'true', so a caller passing True silently got force=False.
    overwrite = overwrite is True or overwrite == 'true'
    mets = OcrdMets(filename=metsfile)
    inputfiles = []
    pagelabels = []
    metadata = get_metadata(mets)
    for pdf in mets.find_files(mimetype='application/pdf', fileGrp=filegrp, pageId=(page_ids or None)):
        # ignore multipaged pdfs (files that carry no pageId)
        if not pdf.pageId:
            continue
        inputfiles.append(pdf.local_filename)
        if pagelabel != "pagenumber":
            pagelabels.append(getattr(pdf, pagelabel, ""))
    log = getLogger('processor.pagetopdf')
    if not inputfiles:
        log.warning("No PDF input files for merging %s", outputfile)
        return None
    if pdfmerge(inputfiles, outputfile, pagelabels=pagelabels, metadata=metadata):
        mets.add_file(filegrp, mimetype='application/pdf', ID=outputfile,
                      url=str(Path(filegrp).joinpath(outputfile + '.pdf')),
                      force=overwrite)
        # distinct name: do not shadow the loop variable with the file handle
        with atomic_write(metsfile, overwrite=True) as mets_out:
            mets_out.write(mets.to_xml(xmllint=True).decode('utf-8'))
    return None
def test_make_file_id_744(self):
    """
    https://github.com/OCR-D/core/pull/744
    > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works.
    """
    doc = OcrdMets.empty_mets()
    doc.add_file('GRP2', ID='img1796-97_00000024_img', pageId='phys0024')
    latest = doc.add_file('GRP2', ID='img1796-97_00000025_img', pageId='phys0025')
    self.assertEqual(make_file_id(latest, 'GRP2'), 'GRP2_0002')
def test_make_file_id_mets(self):
    """Exercise ``make_file_id`` against a METS as files are added to and removed from the target group."""
    doc = OcrdMets.empty_mets()
    for number in range(1, 10):
        doc.add_file('FOO', ID="FOO_%04d" % (number), mimetype="image/tiff")
        doc.add_file('BAR', ID="BAR_%04d" % (number), mimetype="image/tiff")
    bar_seven = doc.find_files(ID='BAR_0007')[0]
    self.assertEqual(make_file_id(bar_seven, 'FOO'), 'FOO_0007')
    odd_one = doc.add_file('ABC', ID="BAR_7", mimetype="image/tiff")
    self.assertEqual(make_file_id(odd_one, 'FOO'), 'FOO_0010')
    # with the FOO group emptied, numbering starts over
    doc.remove_file(fileGrp='FOO')
    self.assertEqual(make_file_id(odd_one, 'FOO'), 'FOO_0001')
    doc.add_file('FOO', ID="FOO_0001", mimetype="image/tiff")
    self.assertEqual(make_file_id(odd_one, 'FOO'), 'FOO_0002')
def test_ocrd_file_eq(self):
    """Check the ``==`` and ``!=`` operators of files, in and outside a METS."""
    doc = OcrdMets.empty_mets()
    foo_1 = doc.add_file('FOO', ID='FOO_1', mimetype='image/tiff')
    # a file is equal to itself, for both operators
    self.assertEqual(foo_1 == foo_1, True)
    self.assertEqual(foo_1 != foo_1, False)
    foo_2 = doc.add_file('FOO', ID='FOO_2', mimetype='image/tiff')
    self.assertEqual(foo_1 == foo_2, False)
    tiff_file = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tiff')
    tif_file = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tif')
    # be tolerant of different equivalent mimetypes
    self.assertEqual(tiff_file == tif_file, True)
    in_mets = doc.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff')
    self.assertEqual(tiff_file == in_mets, True)
def test_remove_file_group0(self):
    """
    Test removal of filegrp
    """
    sample_dir = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_dir) as tempdir:
        doc = OcrdMets(filename=join(tempdir, 'mets.xml'))

        def expect_counts(groups, files):
            # number of fileGrps first, then number of files overall
            self.assertEqual(len(doc.file_groups), groups)
            self.assertEqual(len(doc.find_all_files()), files)

        expect_counts(17, 35)
        # without recursive=True a group that still has files is refused
        with self.assertRaisesRegex(Exception, "not empty"):
            doc.remove_file_group('OCR-D-GT-ALTO')
        doc.remove_file_group('OCR-D-GT-PAGE', recursive=True)
        expect_counts(16, 33)
def test_rename_file_group0(self):
    """Renaming an unknown fileGrp fails; renaming an existing one replaces its name."""
    sample_dir = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_dir) as tempdir:
        doc = OcrdMets(filename=join(tempdir, 'mets.xml'))
        missing_group_error = "No such fileGrp 'FOOBAR'"
        with self.assertRaisesRegex(FileNotFoundError, missing_group_error):
            doc.rename_file_group('FOOBAR', 'FOOBAR')
        assert 'FOOBAR' not in doc.file_groups
        doc.rename_file_group('OCR-D-GT-PAGE', 'FOOBAR')
        # the old name is gone and the new one is present
        assert 'OCR-D-GT-PAGE' not in doc.file_groups
        assert 'FOOBAR' in doc.file_groups
def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
    """
    Create an empty workspace.

    Arguments:
        directory: target directory for the workspace. If ``None``, a
            temporary directory is created with ``tempfile.mkdtemp`` using
            the prefix ``TMP_PREFIX``. Missing parent directories are created.
    Keyword Arguments:
        mets_basename: file name of the METS file inside ``directory``.
        clobber_mets: whether an already existing METS file may be overwritten.
    Returns:
        a new ``Workspace`` for ``directory`` using the freshly written METS.
    Raises:
        FileExistsError: if the METS file already exists and ``clobber_mets``
            is not set.
    """
    if directory is None:
        directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
    Path(directory).mkdir(parents=True, exist_ok=True)
    mets_path = Path(directory, mets_basename)
    if mets_path.exists() and not clobber_mets:
        raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
    mets = OcrdMets.empty_mets()
    log.info("Writing METS to %s", mets_path)
    mets_path.write_bytes(mets.to_xml(xmllint=True))
    # Forward the basename: without it the workspace falls back to its own
    # default name and would not point at the file that was just written
    # whenever a custom mets_basename is used.
    return Workspace(self, directory, mets, mets_basename=mets_basename)
def test_add_file():
    """Added files show up in their fileGrp and can be moved between physical pages."""
    doc = OcrdMets.empty_mets()

    def output_file_count():
        # find_all_files may be lazy, so materialize before counting
        return len(list(doc.find_all_files(fileGrp='OUTPUT')))

    assert len(doc.file_groups) == 0, '0 file groups'
    assert output_file_count() == 0, '0 files in "OUTPUT"'
    first = doc.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar")
    second = doc.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
    assert first.pageId == 'foobar', 'pageId set'
    assert len(doc.file_groups) == 1, '1 file groups'
    assert output_file_count() == 2, '2 files in "OUTPUT"'
    # (target page, file, order, orderlabel) -- applied strictly in this sequence
    reassignments = (
        ('barfoo', first, '300', "page 300"),
        ('quux', second, '302', "page 302"),
        ('barfoo', second, '301', "page 301"),
    )
    for page_id, ocrd_file, order, label in reassignments:
        doc.set_physical_page_for_file(page_id, ocrd_file, order=order, orderlabel=label)
        assert ocrd_file.pageId == page_id, 'pageId changed'
    assert len(doc.file_groups) == 1, '1 file group'
def test_remove_file_group_regex(self):
    """
    Test removal of filegrp
    """
    sample_dir = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_dir) as tempdir:
        doc = OcrdMets(filename=join(tempdir, 'mets.xml'))

        def expect_counts(groups, files):
            # number of fileGrps first, then number of files overall
            self.assertEqual(len(doc.file_groups), groups)
            self.assertEqual(len(doc.find_all_files()), files)

        expect_counts(17, 35)
        # the group name is given with a leading '//' followed by a pattern
        doc.remove_file_group('//OCR-D-GT-.*', recursive=True)
        expect_counts(15, 31)
def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None):
    """
    Store the workspace settings, attach a METS document and set up empty image caches.

    Arguments:
        resolver: stored as ``self.resolver``.
        directory: workspace directory; stored as ``self.directory``.
        mets: METS object to use. If ``None``, an ``OcrdMets`` is loaded
            from ``mets_basename`` inside ``directory``.
        mets_basename: file name of the METS file within ``directory``.
        automatic_backup: stored as ``self.automatic_backup``.
        baseurl: stored as ``self.baseurl``.
    """
    self.resolver = resolver
    self.directory = directory
    self.mets_target = str(Path(directory) / mets_basename)
    # only read the METS from disk when the caller did not hand one in
    self.mets = OcrdMets(filename=self.mets_target) if mets is None else mets
    self.automatic_backup = automatic_backup
    self.baseurl = baseurl
    # one initially empty cache per kind
    self.image_cache = {kind: {} for kind in ('pil', 'cv2', 'exif')}
def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
    """
    Create an empty workspace.

    Arguments:
        directory: target directory for the workspace. If ``None``, a
            temporary directory is created with ``tempfile.mkdtemp`` using
            the prefix ``TMP_PREFIX``. A missing directory is created.
    Keyword Arguments:
        mets_basename: file name of the METS file inside ``directory``.
        clobber_mets: whether an already existing METS file may be overwritten.
    Returns:
        a new ``Workspace`` for ``directory`` using the freshly written METS.
    Raises:
        Exception: if the METS file already exists and ``clobber_mets`` is
            not set.
    """
    if directory is None:
        directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
    if not exists(directory):
        makedirs(directory)
    mets_fpath = join(directory, mets_basename)
    if not clobber_mets and exists(mets_fpath):
        raise Exception("Not clobbering existing mets.xml in '%s'." % directory)
    mets = OcrdMets.empty_mets()
    with open(mets_fpath, 'wb') as fmets:
        log.info("Writing %s", mets_fpath)
        fmets.write(mets.to_xml(xmllint=True))
    # Forward the basename: without it the workspace falls back to its own
    # default name and would not point at the file that was just written
    # whenever a custom mets_basename is used.
    return Workspace(self, directory, mets, mets_basename=mets_basename)
def read_from_mets(metsfile, filegrp, outputfile, pagelabel='pageId'):
    """
    Merge the single-page PDFs of one fileGrp and register the result in the METS.

    Arguments:
        metsfile: path of the METS file to read and possibly rewrite.
        filegrp: fileGrp searched for ``application/pdf`` files; the merged
            file is added to the same group.
        outputfile: used as the ID of the merged file and, with ``.pdf``
            appended, as its file name below ``filegrp``.
        pagelabel: attribute of each input file used as its page label;
            with ``"pagenumber"`` no labels are collected.
    Returns:
        None
    """
    mets = OcrdMets(filename=metsfile)
    inputfiles = []
    pagelabels = []
    metadata = get_metadata(mets)
    collect_labels = pagelabel != "pagenumber"
    for pdf in mets.find_files(mimetype='application/pdf', fileGrp=filegrp):
        # ignore multipaged pdfs (files that carry no pageId)
        if not pdf.pageId:
            continue
        inputfiles.append(pdf.local_filename)
        if collect_labels:
            pagelabels.append(getattr(pdf, pagelabel, ""))
    if not inputfiles:
        return None
    # NOTE(review): the METS is only updated when pdfmerge returns a falsy
    # value -- presumably a zero exit status; confirm against pdfmerge
    merge_result = pdfmerge(inputfiles, outputfile, pagelabels=pagelabels, metadata=metadata)
    if not merge_result:
        merged_url = str(Path(filegrp).joinpath(outputfile + '.pdf'))
        mets.add_file(filegrp, mimetype='application/pdf', ID=outputfile, url=merged_url)
        with atomic_write(metsfile, overwrite=True) as mets_out:
            mets_out.write(mets.to_xml(xmllint=True).decode('utf-8'))
    return None
def setUp(self):
    """Run the parent set-up, then load the sample METS into ``self.mets``."""
    super().setUp()
    sample_mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    self.mets = OcrdMets(filename=sample_mets_url)
def test_physical_pages_for_fileids(self):
    """Looking up the physical page of a known file ID in the sample METS."""
    sample_dir = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_dir) as tempdir:
        doc = OcrdMets(filename=join(tempdir, 'mets.xml'))
        pages = doc.get_physical_pages(for_fileIds=['FILE_0002_IMAGE'])
        self.assertEqual(pages, ['PHYS_0002'])
def test_physical_pages_from_empty_mets(self):
    """Adding a file with a pageId to a bare ``<mets>`` yields one physical page."""
    doc = OcrdMets(content="<mets></mets>")
    self.assertEqual(len(doc.physical_pages), 0, 'no physical page')
    doc.add_file('OUTPUT', ID="foo123", pageId="foobar")
    self.assertEqual(len(doc.physical_pages), 1, '1 physical page')
def test_str(self):
    """The string form of an empty METS lists no fileGrps and no files."""
    empty = OcrdMets(content='<mets/>')
    expected = 'OcrdMets[fileGrps=[],files=[]]'
    self.assertEqual(str(empty), expected)
def test_nocontent_nofilename(self):
    """Constructing ``OcrdMets`` with neither filename nor content must fail."""
    expected_message = "Must pass 'filename' or 'content' to"
    with self.assertRaisesRegex(Exception, expected_message):
        OcrdMets()
def test_remove_file_regex(self):
    """Removing files by an ID pattern also drops the physical page they belonged to."""
    sample_dir = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_dir) as tempdir:
        doc = OcrdMets(filename=join(tempdir, 'mets.xml'))
        pages_before = ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
        self.assertEqual(doc.physical_pages, pages_before)
        # the ID is given with a leading '//' followed by a pattern
        doc.remove_file('//FILE_0005.*')
        self.assertEqual(doc.physical_pages, ['PHYS_0001', 'PHYS_0002'])