def test_validate_ocrd_file(self):
    """Validating the faulty-glyphs PAGE file must yield exactly 17 consistency errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        # Only ConsistencyError entries count; other error kinds are ignored here.
        consistency_errors = [err for err in report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
def test_tmpConfigfile(self):
    """An ``ocrd_logging.conf`` in the working directory must override the logging defaults."""
    self.assertNotEqual(logging.getLogger('').getEffectiveLevel(), logging.NOTSET)
    with TemporaryDirectory() as tempdir, pushd_popd(tempdir):
        # write logging configuration file (MWE)
        with open('ocrd_logging.conf', 'w') as conf_file:
            conf_file.write('''
                [loggers]
                keys=root

                [handlers]
                keys=consoleHandler

                [formatters]
                keys=

                [logger_root]
                level=ERROR
                handlers=consoleHandler

                [handler_consoleHandler]
                class=StreamHandler
                formatter=
                args=(sys.stdout,)
                ''')
        # this will call logging.config.fileConfig with disable_existing_loggers=True,
        # so the defaults from the import-time initLogging should be invalidated
        initLogging()
        # ensure log level is set from temporary config file
        self.assertEqual(logging.getLogger('').getEffectiveLevel(), logging.ERROR)
def download_file(self, f, _recursion_count=0):
    """
    Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

    Arguments:
        f (OcrdFile): the file to make locally available; its ``url`` and
            ``local_filename`` are updated in place
        _recursion_count (int): internal guard so that prepending the
            workspace ``baseurl`` is attempted at most once

    Returns:
        The same ``f``, with ``local_filename`` set to its (possibly updated) ``url``.

    Raises:
        Exception: if the download fails and either no ``baseurl`` is defined
            or prepending the ``baseurl`` was already tried.
    """
    log = getLogger('ocrd.workspace.download_file')
    log.debug('download_file %s [_recursion_count=%s]', f, _recursion_count)
    with pushd_popd(self.directory):
        try:
            # If the f.url is already a file path, and is within self.directory, do nothing
            url_path = Path(f.url).resolve()
            already_local = url_path.exists() and bool(
                url_path.relative_to(str(Path(self.directory).resolve())))
        except Exception:  # pylint: disable=broad-except
            # Anything that goes wrong here (e.g. relative_to() raising because the
            # path lies outside the workspace) simply means "needs downloading".
            already_local = False

        if not already_local:
            basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
            try:
                f.url = self.resolver.download_to_directory(
                    self.directory, f.url, subdir=f.fileGrp, basename=basename)
            except FileNotFoundError as e:
                if not self.baseurl:
                    raise Exception(
                        "No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) from e
                if _recursion_count >= 1:
                    raise Exception(
                        "Already tried prepending baseurl '%s'. Cannot retrieve '%s'"
                        % (self.baseurl, f.url)) from e
                log.debug(
                    "First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s",
                    f.url, self.baseurl, e)
                # Retry exactly once with the baseurl prepended
                f.url = '%s/%s' % (self.baseurl, f.url)
                f.url = self.download_file(f, _recursion_count + 1).local_filename

        f.local_filename = f.url
        return f
def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
    """
    Remove a fileGrp.

    Arguments:
        USE (string): USE attribute of the fileGrp to delete
        recursive (boolean): Whether to recursively delete all files in the group
        force (boolean): Continue removing even if group or containing files not found in METS
        keep_files (boolean): When deleting recursively whether to keep files on disk
        page_recursive (boolean): Whether to remove all images referenced in the file if the file is a PAGE-XML document.
        page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is ``True``.

    Raises:
        Exception: if ``USE`` is neither a regex nor a known fileGrp and ``force`` is not in effect.
    """
    if not force and self.overwrite_mode:
        force = True

    if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force):
        raise Exception("No such fileGrp: %s" % USE)

    file_dirs = set()
    if recursive:
        # Snapshot the matches first: remove_file() modifies the METS that
        # find_files() is reading from.
        for f in list(self.mets.find_files(fileGrp=USE)):
            self.remove_file(f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
            # Files without a local copy have no directory to clean up, and a
            # file directly in the workspace root has an empty dirname, which
            # listdir() cannot handle.
            if f.local_filename:
                file_dir = path.dirname(f.local_filename)
                if file_dir:
                    file_dirs.add(file_dir)

    self.mets.remove_file_group(USE, force=force)

    # PLEASE NOTE: this only removes directories in the workspace if they are empty
    # and named after the fileGrp which is a convention in OCR-D.
    with pushd_popd(self.directory):
        if Path(USE).is_dir() and not listdir(USE):
            Path(USE).rmdir()
        for file_dir in file_dirs:
            if Path(file_dir).is_dir() and not listdir(file_dir):
                Path(file_dir).rmdir()
def remove_file_group(self, USE, recursive=False, force=False, keep_files=False):
    """
    Remove a fileGrp.

    Arguments:
        USE (string): USE attribute of the fileGrp to delete
        recursive (boolean): Whether to recursively delete all files in the group
        force (boolean): Continue removing even if group or containing files not found in METS
        keep_files (boolean): When deleting recursively whether to keep files on disk
    """
    if not (USE in self.mets.file_groups or force):
        raise Exception("No such fileGrp: %s" % USE)

    if recursive:
        for ocrd_file in self.mets.find_files(fileGrp=USE):
            self.remove_file(ocrd_file.ID, force=force, keep_file=keep_files)

    # Re-check membership: with force=True the group may never have existed.
    if USE in self.mets.file_groups:
        self.mets.remove_file_group(USE)

    # XXX this only removes directories in the workspace if they are empty
    # and named after the fileGrp which is a convention in OCR-D.
    with pushd_popd(self.directory):
        group_dir = Path(USE)
        if group_dir.is_dir() and not listdir(USE):
            group_dir.rmdir()
def test_resolve_image_as_pil(self):
    """The full image is 1457 px wide; cutting by coords yields a 50 px wide region."""
    image_url = 'OCR-D-IMG/INPUT_0017.tif'
    with pushd_popd(assets.path_to('kant_aufklaerung_1784/data/')):
        workspace = self.resolver.workspace_from_url('mets.xml')

        full_image = workspace.resolve_image_as_pil(image_url)
        self.assertEqual(full_image.width, 1457)

        region = workspace.resolve_image_as_pil(image_url, coords=([100, 100], [50, 50]))
        self.assertEqual(region.width, 50)
def _validate(self):
    """
    Actual validation.

    Runs every check that is not listed in ``self.skip``, in a fixed order,
    and returns ``self.report``. Any exception raised by a check aborts the
    remaining checks and is recorded as an error in the report.
    """
    try:
        self._resolve_workspace()
    except Exception as e: # pylint: disable=broad-except
        log.warning("Failed to instantiate workspace: %s", e)
        self.report.add_error("Failed to instantiate workspace: %s" % e)
        return self.report

    # Each name is both the key looked up in self.skip and the suffix of the
    # `_validate_<name>` method implementing the check.
    check_names = (
        'mets_unique_identifier',
        'mets_file_group_names',
        'mets_files',
        'pixel_density',
        'multipage',
        'dimension',
        'imagefilename',
        'page',
    )
    with pushd_popd(self.workspace.directory):
        try:
            for check_name in check_names:
                if check_name in self.skip:
                    continue
                # Resolve lazily so that skipped checks are never looked up
                getattr(self, '_validate_' + check_name)()
        except Exception:
            self.report.add_error("Validation aborted with exception: %s" % format_exc())
    return self.report
def do_the_update(bagdir, non_local_urls=False):
    """
    Make the extension of every file referenced in ``<bagdir>/data/mets.xml``
    match the extension ``MIME_TO_EXT`` defines for the file's MIME type.

    For each file, the METS URL gets the extension appended (replacing a known
    but wrong extension) and, if the file exists on disk, it is renamed to the
    same new name. Afterwards the METS is saved and the bag checksums updated.

    Arguments:
        bagdir (string): directory of the unpacked OCRD-ZIP
        non_local_urls (boolean): also rewrite URLs of files that do not exist locally
    """
    directory = Path(bagdir, 'data')
    if not Path(directory, 'mets.xml').exists():
        LOG.error("Something's wrong with OCRD-ZIP at %s, no data/mets.xml!", bagdir)
        return
    workspace = Workspace(resolver, directory=str(directory))
    with pushd_popd(directory):
        for f in workspace.mets.find_files():
            fp = Path(f.url)
            if not fp.exists() and not non_local_urls:
                LOG.debug("Skipping non-local file: %s", fp)
                continue
            ext = MIME_TO_EXT.get(f.mimetype)
            if not ext:
                LOG.error(
                    "No rule to translate '%s' to an extension. Skipping %s",
                    f.mimetype, fp)
                continue
            if fp.suffix == ext:
                LOG.debug("Already has the right extension, %s", fp.name)
                continue
            # Path the new extension gets appended to, on disk
            rename_base = fp
            if fp.suffix and fp.suffix in EXT_TO_MIME:
                LOG.warning("Has the WRONG extension, is '%s' should be '%s'", fp.suffix, ext)
                f.url = f.url[:-len(fp.suffix)]
                # Strip the wrong suffix from the on-disk name as well, so the
                # renamed file and the URL recorded in the METS stay identical.
                rename_base = fp.with_suffix('')
            LOG.info('Renaming %s{,%s}', fp, ext)
            f.url = "%s%s" % (f.url, ext)
            if fp.exists():
                fp.rename('%s%s' % (rename_base, ext))
    workspace.save_mets()
    LOG.debug('Running bagit update script')
    update_checksums(bagdir)
    LOG.info("FINISHED: %s", bagdir)
def _resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a PIL image.

    Args:
        - image_url (string) : URL of the image, looked up in the METS first
        - coords (list) : Coordinates of the bounding box to cut from the image

    Returns:
        Image or region in image as PIL.Image
    """
    log = getLogger('ocrd.workspace._resolve_image_as_pil')
    matches = self.mets.find_files(url=image_url)
    if matches:
        ocrd_file = matches[0]
    else:
        # Not referenced in the METS: wrap the bare URL so it can be downloaded
        ocrd_file = OcrdFile(None, url=image_url)
    image_filename = self.download_file(ocrd_file).local_filename

    with pushd_popd(self.directory):
        pil_image = Image.open(image_filename)
        pil_image.load() # alloc and give up the FD

    if coords is None:
        return pil_image

    log.debug("Converting PIL to OpenCV: %s", image_url)
    if pil_image.mode in ('1', 'L'):
        color_conversion = cv2.COLOR_GRAY2BGR
    else:
        color_conversion = cv2.COLOR_RGB2BGR
    pixels = np.array(pil_image)
    if pil_image.mode == '1':
        # bilevel images need an explicit uint8 cast before cvtColor
        pixels = pixels.astype('uint8')
    cv2_image = cv2.cvtColor(pixels, color_conversion)

    poly = np.array(coords, np.int32)
    log.debug("Cutting region %s from %s", coords, image_url)
    xs, ys = poly[:, 0], poly[:, 1]
    # NOTE(review): the region is cut from the BGR array and handed to
    # Image.fromarray unchanged -- confirm the channel order is intended.
    region_cut = cv2_image[np.min(ys):np.max(ys), np.min(xs):np.max(xs)]
    return Image.fromarray(region_cut)
def test_dimensions(self):
    """A tampered ``imageHeight`` is reported by the 'dimension' check only."""
    skip_all_but_dimension = [
        'page', 'mets_unique_identifier', 'mets_file_group_names',
        'mets_files', 'pixel_density', 'imagefilename', 'page_xsd',
        'mets_xsd'
    ]
    skip_including_dimension = [
        'page', 'mets_unique_identifier', 'mets_file_group_names',
        'mets_files', 'pixel_density', 'imagefilename', 'dimension',
        'page_xsd', 'mets_xsd'
    ]
    with TemporaryDirectory() as tempdir:
        wsdir = join(tempdir, 'foo')
        copytree(assets.path_to('kant_aufklaerung_1784/data'), wsdir)
        mets_path = join(wsdir, 'mets.xml')
        with pushd_popd(wsdir):
            # Falsify the height recorded in the PAGE file of the copied workspace
            os.system(
                """sed -i 's,imageHeight="2083",imageHeight="1234",' OCR-D-GT-PAGE/PAGE_0017_PAGE.xml"""
            )

            report = WorkspaceValidator.validate(
                self.resolver,
                mets_path,
                src_dir=wsdir,
                skip=skip_all_but_dimension,
                download=True)
            self.assertIn(
                "PAGE 'PAGE_0017_PAGE': @imageHeight != image's actual height (1234 != 2083)",
                report.errors)
            self.assertEqual(len(report.errors), 1)
            self.assertEqual(report.is_valid, False)

            # With the dimension check skipped too, nothing is left to complain about
            report2 = WorkspaceValidator.validate(
                self.resolver,
                mets_path,
                src_dir=wsdir,
                skip=skip_including_dimension,
                download=False)
            self.assertEqual(report2.is_valid, True)
def add_file(self, file_grp, content=None, **kwargs):
    """
    Add an output file. Creates an :class:`OcrdFile` to pass around and adds that to the
    OcrdMets OUTPUT section.

    Arguments:
        file_grp (string): fileGrp to add the file to
        content (string|bytes): content to write to ``local_filename``;
            strings are encoded as UTF-8
        **kwargs: passed on to ``self.mets.add_file``; ``url`` defaults to
            ``local_filename`` if only the latter is given

    Returns:
        Whatever ``self.mets.add_file`` returns for the new file.

    Raises:
        Exception: if ``content`` is given without ``local_filename``.
    """
    log.debug('outputfile file_grp=%s local_filename=%s content=%s', file_grp, kwargs.get('local_filename'), content is not None)
    if content is not None and 'local_filename' not in kwargs:
        raise Exception("'content' was set but no 'local_filename'")

    with pushd_popd(self.directory):
        if 'local_filename' in kwargs:
            # Only create a directory if the filename actually has a directory
            # part; for a bare file name there is nothing to create (and
            # creating a directory named like the file would block writing it).
            local_filename_dir, sep, _ = kwargs['local_filename'].rpartition('/')
            if sep and not Path(local_filename_dir).is_dir():
                makedirs(local_filename_dir)
            if 'url' not in kwargs:
                kwargs['url'] = kwargs['local_filename']

        ret = self.mets.add_file(file_grp, **kwargs)

        if content is not None:
            with open(kwargs['local_filename'], 'wb') as f:
                if isinstance(content, str):
                    content = bytes(content, 'utf-8')
                f.write(content)

    return ret
def test_bulk_add_stdin(self):
    """``bulk-add`` with ``-`` as source reads the lines to match from stdin."""
    resolver = Resolver()
    with pushd_popd(tempdir=True) as wsdir:
        ws = resolver.workspace_from_nothing(directory=wsdir)
        Path(wsdir, 'BIN').mkdir()
        for rel_path in (
                'BIN/FILE_0001_BIN.IMG-wolf.png',
                'BIN/FILE_0002_BIN.IMG-wolf.png',
                'BIN/FILE_0001_BIN.xml',
                'BIN/FILE_0002_BIN.xml',
        ):
            Path(wsdir, rel_path).write_text('')
        stdin_lines = (
            'PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png BIN/FILE_0001_BIN.IMG-wolf.png image/png\n'
            'PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png BIN/FILE_0002_BIN.IMG-wolf.png image/png\n'
            'PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml BIN/FILE_0001_BIN.xml application/vnd.prima.page+xml\n'
            'PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml BIN/FILE_0002_BIN.xml application/vnd.prima.page+xml\n'
        )
        cli_args = [
            'bulk-add',
            '-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<src>.*) (?P<dest>.*) (?P<mimetype>.*)',
            '-G', '{{ filegrp }}',
            '-g', '{{ pageid }}',
            '-i', '{{ fileid }}',
            '-m', '{{ mimetype }}',
            '-u', "{{ dest }}",
            '-'
        ]
        with mock_stdin(stdin_lines):
            assert len(ws.mets.file_groups) == 0
            exit_code, out, err = self.invoke_cli(workspace_cli, cli_args)
            # The CLI wrote the METS on disk; refresh the in-memory copy
            ws.reload_mets()
            assert len(ws.mets.file_groups) == 1
            assert len(list(ws.mets.find_files())) == 4
            first_file = next(ws.mets.find_files())
            assert first_file.mimetype == 'image/png'
            assert first_file.ID == 'FILE_0001_BIN.IMG-wolf'
            assert first_file.url == 'BIN/FILE_0001_BIN.IMG-wolf.png'
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
    will be interpreted as a regular expression.)
    """
    # The four filter arguments are handed to find_files() unchanged; every
    # match whose local_filename is unset or missing on disk is dropped from
    # the METS, which is saved afterwards. Errors are logged and re-raised.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
    with pushd_popd(workspace.directory):
        # Snapshot the matches before removing anything, so that removal
        # cannot disturb an iteration that is still in progress.
        candidates = list(workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ))
        for f in candidates:
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # %s, not %f: f is a file object, not a float
                ctx.log.exception("Error removing %s: %s", f, e)
                raise
        workspace.save_mets()
def test_mets_basename_and_mets(self):
    """Passing both ``-m`` and ``-M`` must be rejected with a ValueError."""
    cli_args = ['-m', 'foo.xml', '-M', 'not-foo.xml', 'init']
    expected_message = "Use either --mets or --mets-basename, not both"
    with pushd_popd(tempdir=True):
        with self.assertRaisesRegex(ValueError, expected_message):
            self.invoke_cli(workspace_cli, cli_args)
def files_for_page_id(self, page_id: str, file_group: str = DEFAULT_FILE_GROUP, mimetype: str = None) \
        -> List[OcrdFile]:
    """
    Find the files of ``page_id`` in ``file_group`` (optionally restricted to
    ``mimetype``) and return them after passing each one through the
    workspace's ``download_file``.
    """
    downloaded: List[OcrdFile] = []
    with pushd_popd(self.workspace.directory):
        matches = self.workspace.mets.find_files(
            fileGrp=file_group, pageId=page_id, mimetype=mimetype)
        for match in matches:
            downloaded.append(self.workspace.download_file(match))
    return downloaded
def test_add_519(self):
    """
    https://github.com/OCR-D/core/issues/519
    """
    with TemporaryDirectory() as tempdir:
        wsdir = Path(tempdir, "workspace")
        srcdir = Path(tempdir, "source")
        for new_dir in (wsdir, srcdir):
            new_dir.mkdir()
        srcfile = Path(srcdir, "srcfile.jpg")
        srcfile_content = 'foo'
        srcfile.write_text(srcfile_content)
        with pushd_popd(str(wsdir)):
            exit_code, out, err = self.invoke_cli(workspace_cli, ['init'])
            # Add a file that lives outside of the workspace
            add_args = [
                'add',
                '-m', 'image/jpg',
                '-G', 'MAX',
                '-i', 'IMG_MAX_1818975',
                '-C',
                str(srcfile)
            ]
            exit_code, out, err = self.invoke_cli(workspace_cli, add_args)
            self.assertEqual(exit_code, 0)
            added_file = Path(wsdir, 'MAX', 'srcfile.jpg')
            self.assertTrue(added_file.exists())
            self.assertEqual(added_file.read_text(), srcfile_content)
def test_mets_get_id_set_id(self):
    """``get-id`` prints the identifier previously stored with ``set-id``."""
    mets_id = 'foo123'
    with pushd_popd(tempdir=True):
        self.invoke_cli(workspace_cli, ['init'])
        self.invoke_cli(workspace_cli, ['set-id', mets_id])
        get_id_result = self.invoke_cli(workspace_cli, ['get-id'])
        # invoke_cli results unpack as (exit code, stdout, stderr)
        _, stdout, _ = get_id_result
        self.assertEqual(stdout, mets_id + '\n')
def test_bulk_add(self):
    """``bulk-add`` with a glob adds every matching image and PAGE file to the METS."""
    NO_FILES = 100
    with TemporaryDirectory() as srcdir:
        img_dir = Path(srcdir, "OCR-D-IMG")
        page_dir = Path(srcdir, "OCR-D-PAGE")
        img_dir.mkdir()
        page_dir.mkdir()
        for page_no in range(NO_FILES):
            Path(img_dir, "page_%04d.tif" % page_no).write_text('')
        for page_no in range(NO_FILES):
            Path(page_dir, "page_%04d.xml" % page_no).write_text('')
        with TemporaryDirectory() as wsdir, pushd_popd(wsdir):
            ws = self.resolver.workspace_from_nothing(directory=wsdir)
            cli_args = [
                'bulk-add',
                '--ignore',
                '--regex', r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$',
                '--url', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}',
                '--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}',
                '--page-id', 'PHYS_{{ pageid }}',
                '--file-grp', '{{ fileGrp }}',
                '%s/*/*' % srcdir
            ]
            exit_code, out, err = self.invoke_cli(workspace_cli, cli_args)
            # The CLI wrote the METS on disk; refresh the in-memory copy
            ws.reload_mets()
            mets = ws.mets
            self.assertEqual(len(mets.file_groups), 2)
            self.assertEqual(len(mets.find_all_files()), 2 * NO_FILES)
            self.assertEqual(len(mets.find_all_files(mimetype='image/tiff')), NO_FILES)
            self.assertEqual(len(mets.find_all_files(ID='//FILE_OCR-D-IMG_000.*')), 10)
            self.assertEqual(len(mets.find_all_files(ID='//FILE_.*_000.*')), 20)
            self.assertEqual(len(mets.find_all_files(pageId='PHYS_0001')), 2)
            page_file = mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0]
            self.assertEqual(page_file.url, 'OCR-D-PAGE/FILE_0001.xml')
def test_bulk_add_gen_id(self):
    """Without ``--file-id``, ``bulk-add`` generates the file ID itself."""
    with pushd_popd(tempdir=True) as wsdir:
        ws = self.resolver.workspace_from_nothing(directory=wsdir)
        Path(wsdir, 'c').write_text('')
        cli_args = [
            'bulk-add',
            '-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<src>.*) (?P<url>.*) (?P<mimetype>.*)',
            '-G', '{{ filegrp }}',
            '-g', '{{ pageid }}',
            '-S', '{{ src }}',
            # XXX --file-id ('-i') is left out on purpose
            '-m', '{{ mimetype }}',
            '-u', "{{ url }}",
            'a b c d e'
        ]
        _, out, err = self.invoke_cli(workspace_cli, cli_args)
        ws.reload_mets()
        assert next(ws.mets.find_files()).ID == 'a_b_c_d_e'
        assert next(ws.mets.find_files()).url == 'd'
def test_mets_basename(self):
    """``-m foo.xml init`` creates ``foo.xml`` and no ``mets.xml``."""
    with TemporaryDirectory() as tempdir, pushd_popd(tempdir):
        init_result = self.runner.invoke(workspace_cli, ['-m', 'foo.xml', 'init'])
        self.assertEqual(init_result.exit_code, 0)
        self.assertTrue(exists('foo.xml'))
        self.assertFalse(exists('mets.xml'))
def download_file(self, f, _recursion_count=0):
    """
    Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

    If the first attempt fails with ``FileNotFoundError``, the download is
    retried once with the workspace ``baseurl`` prepended to the URL.
    """
    log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count))
    with pushd_popd(self.directory):
        # XXX FIXME hacky
        if f.ID:
            basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
        else:
            basename = f.basename
        try:
            downloaded_url = self.resolver.download_to_directory(
                self.directory, f.url, subdir=f.fileGrp, basename=basename)
            f.url = downloaded_url
        except FileNotFoundError as e:
            if not self.baseurl:
                raise Exception(
                    "No baseurl defined by workspace. Cannot retrieve '%s'" % f.url)
            if _recursion_count >= 1:
                raise Exception(
                    "Already tried prepending baseurl '%s'. Cannot retrieve '%s'"
                    % (self.baseurl, f.url))
            log.debug(
                "First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s",
                f.url, self.baseurl, e)
            f.url = '%s/%s' % (self.baseurl, f.url)
            retried = self.download_file(f, _recursion_count + 1)
            f.url = retried.local_filename
        # XXX FIXME HACK
        f.local_filename = f.url
        return f
def test_bulk_add_missing_param(self):
    """Without ``--page-id``, ``bulk-add`` fails because 'pageId' is unset."""
    cli_args = [
        'bulk-add',
        '-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<src>.*) (?P<url>.*) (?P<mimetype>.*)',
        '-G', '{{ filegrp }}',
        # XXX --page-id ('-g') is left out on purpose
        '-i', '{{ fileid }}',
        '-m', '{{ mimetype }}',
        '-u', "{{ url }}",
        'a b c d e f', '1 2 3 4 5 6'
    ]
    with pushd_popd(tempdir=True) as wsdir:
        ws = self.resolver.workspace_from_nothing(directory=wsdir)
        with pytest.raises(ValueError, match=r"OcrdFile attribute 'pageId' unset"):
            _, out, err = self.invoke_cli(workspace_cli, cli_args)
            # Only reached if the CLI did NOT raise: dump its output and fail
            print('out', out)
            print('err', err)
            assert 0
def remove_file(self, ID, force=False, keep_file=False):
    """
    Remove a file from the workspace.

    Arguments:
        ID (string|OcrdFile): ID of the file to delete or the file itself
        force (boolean): Continue removing even if file not found in METS
        keep_file (boolean): Whether to keep files on disk

    Returns:
        The removed file, or ``None`` if a ``FileNotFoundError`` was
        suppressed because of ``force``.
    """
    log.debug('Deleting mets:file %s', ID)
    try:
        removed = self.mets.remove_file(ID)
        if keep_file:
            return removed
        if removed.local_filename:
            with pushd_popd(self.directory):
                log.info("rm %s [cwd=%s]", removed.local_filename, self.directory)
                unlink(removed.local_filename)
            return removed
        # Nothing on disk to delete
        log.warning("File not locally available %s", removed)
        if not force:
            raise Exception("File not locally available %s" % removed)
        return removed
    except FileNotFoundError as e:
        if not force:
            raise e
        return None
def test_workspace_from_url_rel_dir(self):
    """A ``dst_dir`` given as a long relative path resolves to the real directory."""
    with TemporaryDirectory() as dst_dir:
        # Climb far above the cwd, then descend into dst_dir (minus its leading slash)
        relative_part = dst_dir[1:]
        bogus_dst_dir = '../../../../../../../../../../../../../../../../%s' % relative_part
        with pushd_popd(FOLDER_KANT):
            workspace = self.resolver.workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)
            expected_mets = pjoin(dst_dir, 'mets.xml')
            self.assertEqual(workspace.mets_target, expected_mets)
            self.assertEqual(workspace.directory, dst_dir)
def test_mets_basename_and_not_mets(self):
    """``-M`` together with ``-d`` still works but prints a deprecation notice."""
    cli_args = ['-d', 'foo', '-M', 'not-foo.xml', 'init']
    deprecation_notice = '--mets-basename is deprecated. Use --mets/--directory instead'
    with pushd_popd(tempdir=True) as tempdir:
        _, stdout, stderr = self.invoke_cli(workspace_cli, cli_args)
        expected_stdout = join(tempdir, 'foo') + '\n'
        self.assertEqual(stdout, expected_stdout)
        self.assertIn(deprecation_notice, stderr)
def test_find_all_files(self):
    """``find -G OCR-D-IMG-BIN -k fileGrp`` prints the group once per matching file."""
    find_args = ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp']
    with TemporaryDirectory() as tempdir:
        wsdir = join(tempdir, 'ws')
        copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
        with pushd_popd(wsdir):
            find_result = self.runner.invoke(workspace_cli, find_args)
            self.assertEqual(find_result.output, 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
            self.assertEqual(find_result.exit_code, 0)
def test_mets_directory_incompatible(self):
    """A ``--mets`` path outside of ``--directory`` must be rejected with a ValueError."""
    cli_args = ['-d', 'foo', '-m', '/somewhere/else', 'init']
    expected_message = "--mets has a directory part inconsistent with --directory"
    with pushd_popd(tempdir=True):
        with self.assertRaisesRegex(ValueError, expected_message):
            self.invoke_cli(workspace_cli, cli_args)
def test_processor_run(self):
    """The dummy processor CLI exits with code 0 on a copy of the sample workspace."""
    processor_args = ['-p', '{"foo": 42}', '--mets', 'mets.xml']
    sample_data = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_data) as tempdir, pushd_popd(tempdir):
        run_result = self.runner.invoke(cli_dummy_processor, processor_args)
        self.assertEqual(run_result.exit_code, 0)
def test_mets_directory_html(self):
    """An http(s) ``--mets`` URL without ``--directory`` must be rejected with a ValueError."""
    cli_args = ['-m', 'https://foo.bar/bla', 'init']
    expected_pattern = r"--mets is an http\(s\) URL but no --directory was given"
    with pushd_popd(tempdir=True):
        with self.assertRaisesRegex(ValueError, expected_pattern):
            self.invoke_cli(workspace_cli, cli_args)
def test_parameter_override_wo_param(self):
    """``-P baz two`` without any ``-p`` makes the dummy processor print ``{"baz": "two"}``."""
    processor_args = ['-P', 'baz', 'two', *DEFAULT_IN_OUT]
    sample_data = assets.path_to('SBB0000F29300010000/data')
    with copy_of_directory(sample_data) as tempdir, pushd_popd(tempdir):
        code, stdout, err = self.invoke_cli(cli_dummy_processor, processor_args)
        print(stdout)
        self.assertEqual(stdout, '{"baz": "two"}\n')