Ejemplo n.º 1
0
def test_encoding_entities():
    """Non-ASCII text in the METS content must still appear in the serialized XML."""
    xml_source = """
    <mets>
        <metsHdr>
        <agent>
            <name>Őh śéé Áŕ</name>
            <note>OCR-D</note>
        </agent>
        </metsHdr>
    </mets>
    """
    serialized = OcrdMets(content=xml_source).to_xml()
    assert 'Őh śéé Áŕ' in serialized.decode('utf-8')
Ejemplo n.º 2
0
def test_add_file_ignore(sbb_sample_01: OcrdMets):
    """Add a file under an already used ID with ``ignore=True``.

    With the ignore flag set, the responsibility for overwriting existing
    files is delegated to the user.
    """
    file_id = 'best-id-ever'

    first = sbb_sample_01.add_file('OUTPUT', ID=file_id, mimetype="beep/boop")
    assert first.ID == file_id
    second = sbb_sample_01.add_file('OUTPUT', ID=file_id, mimetype="boop/beep", ignore=True)
    assert second.ID == file_id

    # The ID must still resolve to exactly one file entry.
    matches = list(sbb_sample_01.find_files(ID=file_id))
    assert len(matches) == 1
Ejemplo n.º 3
0
    def workspace_from_nothing(self,
                               directory,
                               mets_basename='mets.xml',
                               clobber_mets=False):
        """
        Create a workspace that contains only a newly written, empty METS file.

        Arguments:
            directory (string): Target directory for the workspace; it is
                created (including parents) if missing. If ``None``, a
                temporary directory is created with
                :py:data:`ocrd.constants.TMP_PREFIX` as name prefix.
                (The resulting path can be retrieved via
                :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string, ``mets.xml``): File name of the METS file
                inside ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing
                METS file of that name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: if the METS file already exists and
                ``clobber_mets`` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        workspace_dir = Path(directory)
        workspace_dir.mkdir(parents=True, exist_ok=True)

        mets_path = workspace_dir / mets_basename
        refuse_overwrite = mets_path.exists() and not clobber_mets
        if refuse_overwrite:
            raise FileExistsError(
                "METS '%s' already exists in '%s' and clobber_mets not set." %
                (mets_basename, directory))

        empty_mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(empty_mets.to_xml(xmllint=True))

        return Workspace(self, directory, empty_mets, mets_basename=mets_basename)
Ejemplo n.º 4
0
def test_add_group():
    """Adding the same file group twice must leave exactly one group."""
    doc = OcrdMets.empty_mets()
    assert len(doc.file_groups) == 0, '0 file groups'
    # The second pass adds 'TEST' again; the count has to stay at 1.
    for _ in range(2):
        doc.add_file_group('TEST')
        assert len(doc.file_groups) == 1, '1 file groups'
Ejemplo n.º 5
0
 def test_add_group(self):
     """Adding the same file group twice must leave exactly one group."""
     doc = OcrdMets.empty_mets()
     self.assertEqual(len(doc.file_groups), 0, '0 file groups')
     # The second pass adds 'TEST' again; the count has to stay at 1.
     for _ in range(2):
         doc.add_file_group('TEST')
         self.assertEqual(len(doc.file_groups), 1, '1 file groups')
Ejemplo n.º 6
0
 def test_add_file(self):
     """Add two files to one group, then reassign their physical pages."""
     doc = OcrdMets.empty_mets()
     self.assertEqual(len(doc.file_groups), 0, '0 file groups')
     self.assertEqual(len(doc.find_files(fileGrp='OUTPUT')), 0, '0 files in "OUTPUT"')

     common = dict(mimetype="bla/quux", pageId="foobar")
     first = doc.add_file('OUTPUT', ID="foo123", **common)
     second = doc.add_file('OUTPUT', ID="foo1232", **common)
     self.assertEqual(first.pageId, 'foobar', 'pageId set')
     self.assertEqual(len(doc.file_groups), 1, '1 file groups')
     self.assertEqual(len(doc.find_files(fileGrp='OUTPUT')), 2, '2 files in "OUTPUT"')

     # (target page, file to move, order); each move is checked right away.
     moves = (
         ('barfoo', first, '300'),
         ('quux', second, '302'),
         ('barfoo', second, '301'),
     )
     for page_id, ocrd_file, order in moves:
         doc.set_physical_page_for_file(page_id,
                                        ocrd_file,
                                        order=order,
                                        orderlabel="page %s" % order)
         self.assertEqual(ocrd_file.pageId, page_id, 'pageId changed')

     self.assertEqual(len(doc.file_groups), 1, '1 file group')
Ejemplo n.º 7
0
def test_unique_identifier_from_nothing():
    """A new empty METS has no unique identifier until one is assigned.

    Also checks that the serialized document mentions the ocrd/core version
    and contains a ``CREATEDATE`` starting with the date of the timestamp
    handed to ``empty_mets``.
    """
    # Take the timestamp exactly once: calling datetime.now() again when
    # building the expected value could yield another date if the test runs
    # across midnight.
    now = datetime.now()
    mets = OcrdMets.empty_mets(now.isoformat())
    assert mets.unique_identifier is None, 'no identifier'
    mets.unique_identifier = 'foo'
    assert mets.unique_identifier == 'foo', 'Right identifier after change is "foo"'
    as_string = mets.to_xml().decode('utf-8')
    assert 'ocrd/core v%s' % VERSION in as_string
    assert 'CREATEDATE="%04u-%02u-%02uT' % (now.year, now.month, now.day) in as_string
Ejemplo n.º 8
0
 def test_page_from_file_no_existe(self):
     """``page_from_file`` must raise ``FileNotFoundError`` for a missing local file."""
     # Build the fixture outside the assertion context so that only the call
     # under test can satisfy the expected exception.
     mets = OcrdMets.empty_mets()
     ocrd_file = mets.add_file('FOO',
                               ID='foo',
                               local_filename='no-existe',
                               mimetype='foo/bar')
     with self.assertRaisesRegex(FileNotFoundError,
                                 "File not found: 'no-existe'"):
         page_from_file(ocrd_file)
Ejemplo n.º 9
0
def test_fptr_changed_for_change_id():
    """After changing a file's ID, the page lookup must work via the new ID only."""
    doc = OcrdMets.empty_mets()
    ocrd_file = doc.add_file('FOO', ID='FOO_1', mimetype='image/tiff', pageId='p0001')
    assert doc.get_physical_pages(for_fileIds=['FOO_1']) == ['p0001']

    ocrd_file.ID = 'BAZ_1'
    # The old ID yields no page any more, the new ID yields the original page.
    expectations = (
        ('FOO_1', [None]),
        ('BAZ_1', ['p0001']),
    )
    for file_id, pages in expectations:
        assert doc.get_physical_pages(for_fileIds=[file_id]) == pages
Ejemplo n.º 10
0
 def __init__(self,
              resolver,
              directory,
              mets=None,
              mets_basename='mets.xml',
              automatic_backup=False,
              baseurl=None):
     """
     Set up the workspace state.

     Arguments:
         resolver: stored as ``self.resolver``.
         directory: stored as ``self.directory``; also the parent of the METS path.
         mets: METS object to use; if ``None``, one is loaded from
             ``directory``/``mets_basename``.
         mets_basename: file name of the METS file inside ``directory``.
         automatic_backup: stored as ``self.automatic_backup``.
         baseurl: stored as ``self.baseurl``.
     """
     self.resolver = resolver
     self.directory = directory
     self.mets_target = str(Path(directory, mets_basename))
     self.overwrite_mode = False
     # Only read the METS from disk when the caller did not supply one.
     self.mets = OcrdMets(filename=self.mets_target) if mets is None else mets
     self.automatic_backup = automatic_backup
     self.baseurl = baseurl
Ejemplo n.º 11
0
def test_ocrd_file_equality():
    """Compare file objects: different IDs, and the same ID with 'image/tiff' vs. 'image/tif'."""
    doc = OcrdMets.empty_mets()
    first = doc.add_file('FOO', ID='FOO_1', mimetype='image/tiff')
    second = doc.add_file('FOO', ID='FOO_2', mimetype='image/tiff')
    assert first != second

    as_tiff = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tiff')
    as_tif = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tif')
    # be tolerant of different equivalent mimetypes
    assert as_tiff == as_tif

    # A file added to the METS under the same ID must compare equal as well.
    in_mets = doc.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff')
    assert as_tiff == in_mets
Ejemplo n.º 12
0
 def test_unique_identifier_from_nothing(self):
     """A new empty METS has no unique identifier until one is assigned.

     Also checks that the serialized document mentions the ocrd/core version
     and contains a ``CREATEDATE`` starting with the date of the timestamp
     handed to ``empty_mets``.
     """
     # Take the timestamp exactly once: calling datetime.now() again when
     # building the expected value could yield another date if the test runs
     # across midnight.
     now = datetime.now()
     mets = OcrdMets.empty_mets(now.isoformat())
     self.assertIsNone(mets.unique_identifier, 'no identifier')
     mets.unique_identifier = 'foo'
     self.assertEqual(mets.unique_identifier, 'foo', 'Right identifier after change')
     as_string = mets.to_xml().decode('utf-8')
     self.assertIn('ocrd/core v%s' % VERSION, as_string)
     self.assertIn('CREATEDATE="%04u-%02u-%02uT' % (now.year, now.month, now.day),
                   as_string)
Ejemplo n.º 13
0
def read_from_mets(metsfile,
                   filegrp,
                   page_ids,
                   outputfile,
                   pagelabel='pageId',
                   overwrite=False):
    """Merge the per-page PDFs of a file group and register the result in the METS.

    Arguments:
        metsfile: path of the METS file; it is read and, after a merge that
            ``pdfmerge`` reports with a truthy result, rewritten.
        filegrp: file group searched for ``application/pdf`` files; the merged
            file is added to the same group.
        page_ids: page filter handed to ``find_files``; any falsy value is
            passed on as ``None``.
        outputfile: output argument for ``pdfmerge``; also used as the ID of
            the new METS file entry (its URL is ``filegrp/outputfile.pdf``).
        pagelabel: name of the file attribute collected as page label. With
            ``"pagenumber"`` no labels are collected.
        overwrite: ``True`` or the string ``'true'`` sets ``force`` when
            adding the merged file; every other value leaves it unset.

    Returns:
        None
    """
    # Accept a real boolean as well as the string form; previously a boolean
    # True was silently treated as False.
    force = overwrite is True or overwrite == 'true'
    mets = OcrdMets(filename=metsfile)
    inputfiles = []
    pagelabels = []
    metadata = get_metadata(mets)
    for ocrd_file in mets.find_files(mimetype='application/pdf',
                                     fileGrp=filegrp,
                                     pageId=(page_ids or None)):
        # ignore multipaged pdfs (entries without a pageId)
        if not ocrd_file.pageId:
            continue
        inputfiles.append(ocrd_file.local_filename)
        if pagelabel != "pagenumber":
            pagelabels.append(getattr(ocrd_file, pagelabel, ""))
    log = getLogger('processor.pagetopdf')
    if not inputfiles:
        log.warning("No PDF input files for merging %s", outputfile)
        return None
    # NOTE(review): a truthy pdfmerge result is treated as success here -
    # confirm against pdfmerge's return convention.
    if pdfmerge(inputfiles,
                outputfile,
                pagelabels=pagelabels,
                metadata=metadata):
        mets.add_file(filegrp,
                      mimetype='application/pdf',
                      ID=outputfile,
                      url=str(Path(filegrp).joinpath(outputfile + '.pdf')),
                      force=force)
        with atomic_write(metsfile, overwrite=True) as mets_out:
            mets_out.write(mets.to_xml(xmllint=True).decode('utf-8'))
    return None
Ejemplo n.º 14
0
 def test_make_file_id_744(self):
     """
     https://github.com/OCR-D/core/pull/744
     > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works.
     """
     doc = OcrdMets.empty_mets()
     doc.add_file('GRP2', ID='img1796-97_00000024_img', pageId='phys0024')
     # Only the second file is handed to make_file_id.
     latest = doc.add_file('GRP2', ID='img1796-97_00000025_img', pageId='phys0025')
     self.assertEqual(make_file_id(latest, 'GRP2'), 'GRP2_0002')
Ejemplo n.º 15
0
 def test_make_file_id_mets(self):
     """Check the IDs ``make_file_id`` produces as the 'FOO' group is filled, emptied and refilled."""
     doc = OcrdMets.empty_mets()
     # Nine files each in 'FOO' and 'BAR', added alternately.
     for idx in range(1, 10):
         for grp in ('FOO', 'BAR'):
             doc.add_file(grp, ID="%s_%04d" % (grp, idx), mimetype="image/tiff")

     bar_seven = doc.find_files(ID='BAR_0007')[0]
     self.assertEqual(make_file_id(bar_seven, 'FOO'), 'FOO_0007')

     other = doc.add_file('ABC', ID="BAR_7", mimetype="image/tiff")
     self.assertEqual(make_file_id(other, 'FOO'), 'FOO_0010')

     doc.remove_file(fileGrp='FOO')
     self.assertEqual(make_file_id(other, 'FOO'), 'FOO_0001')

     doc.add_file('FOO', ID="FOO_0001", mimetype="image/tiff")
     self.assertEqual(make_file_id(other, 'FOO'), 'FOO_0002')
Ejemplo n.º 16
0
 def test_ocrd_file_eq(self):
     """Exercise ``==`` and ``!=`` on file objects with equal and differing IDs."""
     doc = OcrdMets.empty_mets()
     first = doc.add_file('FOO', ID='FOO_1', mimetype='image/tiff')
     self.assertEqual(first == first, True)
     self.assertEqual(first != first, False)

     second = doc.add_file('FOO', ID='FOO_2', mimetype='image/tiff')
     self.assertEqual(first == second, False)

     as_tiff = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tiff')
     as_tif = create_ocrd_file_with_defaults(ID='TEMP_1', mimetype='image/tif')
     # be tolerant of different equivalent mimetypes
     self.assertEqual(as_tiff == as_tif, True)

     # A file added to the METS under the same ID must compare equal as well.
     in_mets = doc.add_file('TEMP', ID='TEMP_1', mimetype='image/tiff')
     self.assertEqual(as_tiff == in_mets, True)
Ejemplo n.º 17
0
 def test_remove_file_group0(self):
     """
     Removing a file group: refused with "not empty" unless ``recursive=True``.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir:
         doc = OcrdMets(filename=join(workdir, 'mets.xml'))
         self.assertEqual(len(doc.file_groups), 17)
         self.assertEqual(len(doc.find_all_files()), 35)

         # Non-recursive removal is expected to fail.
         with self.assertRaisesRegex(Exception, "not empty"):
             doc.remove_file_group('OCR-D-GT-ALTO')

         doc.remove_file_group('OCR-D-GT-PAGE', recursive=True)
         self.assertEqual(len(doc.file_groups), 16)
         self.assertEqual(len(doc.find_all_files()), 33)
Ejemplo n.º 18
0
 def test_rename_file_group0(self):
     """Renaming an unknown group must raise; renaming an existing one replaces its name."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir:
         doc = OcrdMets(filename=join(workdir, 'mets.xml'))

         # 'FOOBAR' does not exist yet, so it cannot be renamed.
         with self.assertRaisesRegex(FileNotFoundError, "No such fileGrp 'FOOBAR'"):
             doc.rename_file_group('FOOBAR', 'FOOBAR')
         assert 'FOOBAR' not in doc.file_groups

         doc.rename_file_group('OCR-D-GT-PAGE', 'FOOBAR')
         assert 'OCR-D-GT-PAGE' not in doc.file_groups
         assert 'FOOBAR' in doc.file_groups
Ejemplo n.º 19
0
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace; it is
                created (including parents) if missing. If ``None``, a
                temporary directory is created with ``TMP_PREFIX`` as name prefix.
        Keyword Arguments:
            mets_basename (string, ``mets.xml``): File name of the METS file
                inside ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing
                METS file of that name.

        Returns:
            a new ``Workspace`` for ``directory``

        Raises:
            FileExistsError: if the METS file already exists and
                ``clobber_mets`` is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        # Pass the basename on; otherwise the workspace would fall back to
        # its default 'mets.xml' instead of the file written above.
        return Workspace(self, directory, mets, mets_basename=mets_basename)
Ejemplo n.º 20
0
def test_add_file():
    """Add two files to one group, then reassign their physical pages."""
    doc = OcrdMets.empty_mets()
    assert len(doc.file_groups) == 0, '0 file groups'
    assert len(list(doc.find_all_files(fileGrp='OUTPUT'))) == 0, '0 files in "OUTPUT"'

    common = dict(mimetype="bla/quux", pageId="foobar")
    first = doc.add_file('OUTPUT', ID="foo123", **common)
    second = doc.add_file('OUTPUT', ID="foo1232", **common)
    assert first.pageId == 'foobar', 'pageId set'
    assert len(doc.file_groups) == 1, '1 file groups'
    assert len(list(doc.find_all_files(fileGrp='OUTPUT'))) == 2, '2 files in "OUTPUT"'

    # (target page, file to move, order); each move is checked right away.
    moves = (
        ('barfoo', first, '300'),
        ('quux', second, '302'),
        ('barfoo', second, '301'),
    )
    for page_id, ocrd_file, order in moves:
        doc.set_physical_page_for_file(page_id, ocrd_file, order=order, orderlabel="page %s" % order)
        assert ocrd_file.pageId == page_id, 'pageId changed'

    assert len(doc.file_groups) == 1, '1 file group'
Ejemplo n.º 21
0
 def test_remove_file_group_regex(self):
     """
     Remove file groups selected by a ``//``-prefixed pattern and check the remaining counts.
     """
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir:
         doc = OcrdMets(filename=join(workdir, 'mets.xml'))
         self.assertEqual(len(doc.file_groups), 17)
         self.assertEqual(len(doc.find_all_files()), 35)

         doc.remove_file_group('//OCR-D-GT-.*', recursive=True)

         self.assertEqual(len(doc.file_groups), 15)
         self.assertEqual(len(doc.find_all_files()), 31)
Ejemplo n.º 22
0
 def __init__(self,
              resolver,
              directory,
              mets=None,
              mets_basename='mets.xml',
              automatic_backup=False,
              baseurl=None):
     """
     Set up the workspace state.

     Arguments:
         resolver: stored as ``self.resolver``.
         directory: stored as ``self.directory``; also the parent of the METS path.
         mets: METS object to use; if ``None``, one is loaded from
             ``directory``/``mets_basename``.
         mets_basename: file name of the METS file inside ``directory``.
         automatic_backup: stored as ``self.automatic_backup``.
         baseurl: stored as ``self.baseurl``.
     """
     self.resolver = resolver
     self.directory = directory
     self.mets_target = str(Path(directory, mets_basename))
     # Only read the METS from disk when the caller did not supply one.
     self.mets = OcrdMets(filename=self.mets_target) if mets is None else mets
     self.automatic_backup = automatic_backup
     self.baseurl = baseurl
     # One initially empty cache per key.
     self.image_cache = {kind: {} for kind in ('pil', 'cv2', 'exif')}
Ejemplo n.º 23
0
    def workspace_from_nothing(self,
                               directory,
                               mets_basename='mets.xml',
                               clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace; it is
                created if missing. If ``None``, a temporary directory is
                created with ``TMP_PREFIX`` as name prefix.
        Keyword Arguments:
            mets_basename (string, ``mets.xml``): File name of the METS file
                inside ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing
                METS file of that name.

        Returns:
            a new ``Workspace`` for ``directory``

        Raises:
            Exception: if the METS file already exists and ``clobber_mets``
                is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        makedirs(directory, exist_ok=True)

        mets_fpath = join(directory, mets_basename)
        if not clobber_mets and exists(mets_fpath):
            raise Exception("Not clobbering existing mets.xml in '%s'." %
                            directory)
        mets = OcrdMets.empty_mets()
        with open(mets_fpath, 'wb') as fmets:
            log.info("Writing %s", mets_fpath)
            fmets.write(mets.to_xml(xmllint=True))

        # Pass the basename on; otherwise the workspace would fall back to
        # its default 'mets.xml' instead of the file written above.
        return Workspace(self, directory, mets, mets_basename=mets_basename)
Ejemplo n.º 24
0
def read_from_mets(metsfile, filegrp, outputfile, pagelabel='pageId'):
    """Merge the per-page PDFs of a file group and possibly register the result in the METS.

    Arguments:
        metsfile: path of the METS file to read and, if updated, rewrite.
        filegrp: file group searched for ``application/pdf`` files; a merged
            file entry is added to the same group.
        outputfile: output argument for ``pdfmerge``; also used as the ID of
            the new METS file entry (its URL is ``filegrp/outputfile.pdf``).
        pagelabel: name of the file attribute collected as page label. With
            ``"pagenumber"`` no labels are collected.

    Returns:
        None
    """
    mets = OcrdMets(filename=metsfile)
    metadata = get_metadata(mets)
    inputfiles = []
    pagelabels = []
    collect_labels = pagelabel != "pagenumber"
    for ocrd_file in mets.find_files(mimetype='application/pdf', fileGrp=filegrp):
        # ignore multipaged PDFs (entries without a pageId)
        if not ocrd_file.pageId:
            continue
        inputfiles.append(ocrd_file.local_filename)
        if collect_labels:
            pagelabels.append(getattr(ocrd_file, pagelabel, ""))

    if not inputfiles:
        return None

    merge_result = pdfmerge(inputfiles,
                            outputfile,
                            pagelabels=pagelabels,
                            metadata=metadata)
    # NOTE(review): the METS is only updated for a falsy result - presumably
    # pdfmerge returns an exit status where 0 means success; confirm.
    if merge_result:
        return None

    mets.add_file(filegrp,
                  mimetype='application/pdf',
                  ID=outputfile,
                  url=str(Path(filegrp).joinpath(outputfile + '.pdf')))
    with atomic_write(metsfile, overwrite=True) as mets_out:
        mets_out.write(mets.to_xml(xmllint=True).decode('utf-8'))
    return None
Ejemplo n.º 25
0
 def setUp(self):
     """Load the sample METS into ``self.mets`` before each test."""
     super().setUp()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
     self.mets = OcrdMets(filename=mets_url)
Ejemplo n.º 26
0
 def test_physical_pages_for_fileids(self):
     """Looking up the physical page for a file ID of the sample METS."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir:
         doc = OcrdMets(filename=join(workdir, 'mets.xml'))
         pages = doc.get_physical_pages(for_fileIds=['FILE_0002_IMAGE'])
         self.assertEqual(pages, ['PHYS_0002'])
Ejemplo n.º 27
0
 def test_physical_pages_from_empty_mets(self):
     """Adding a file with a ``pageId`` to a bare METS must yield one physical page."""
     doc = OcrdMets(content="<mets></mets>")
     self.assertEqual(len(doc.physical_pages), 0, 'no physical page')

     doc.add_file('OUTPUT', ID="foo123", pageId="foobar")
     self.assertEqual(len(doc.physical_pages), 1, '1 physical page')
Ejemplo n.º 28
0
 def test_str(self):
     """String form of a METS without file groups and files."""
     expected = 'OcrdMets[fileGrps=[],files=[]]'
     doc = OcrdMets(content='<mets/>')
     self.assertEqual(str(doc), expected)
Ejemplo n.º 29
0
 def test_nocontent_nofilename(self):
     """Constructing ``OcrdMets`` without any argument must raise."""
     expected_message = "Must pass 'filename' or 'content' to"
     with self.assertRaisesRegex(Exception, expected_message):
         OcrdMets()
Ejemplo n.º 30
0
 def test_remove_file_regex(self):
     """Remove files selected by a ``//``-prefixed pattern and check the remaining physical pages."""
     with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir:
         doc = OcrdMets(filename=join(workdir, 'mets.xml'))
         pages_before = ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
         self.assertEqual(doc.physical_pages, pages_before)

         doc.remove_file('//FILE_0005.*')

         # After the removal the last page is no longer listed.
         self.assertEqual(doc.physical_pages, pages_before[:-1])