def testProcessorProfiling(self):
    """Ensure processor runtime profiling is logged on 'ocrd.process.profile'."""
    initLogging()
    # Capture everything the profiling logger emits into an in-memory FIFO.
    capture = FIFOIO(256)
    handler = logging.StreamHandler(capture)
    handler.setFormatter(logging.Formatter(LOG_FORMAT))
    profile_logger = getLogger('ocrd.process.profile')
    profile_logger.setLevel('DEBUG')
    profile_logger.addHandler(handler)
    run_processor(DummyProcessor,
                  resolver=Resolver(),
                  mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
    log_contents = capture.getvalue()
    capture.close()
    # Check whether profile information has been logged.
    # Dummy should finish in under 0.1s.
    self.assertTrue(
        match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*',
              log_contents))
def _page_get_lines(pcgts): LOG = getLogger('processor.FSTCorrection') lines = [] n_regions = pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order') if not n_regions: LOG.warning('Page contains no text regions') for n_region in n_regions: n_lines = n_region.get_TextLine() if not n_lines: LOG.warning("Region '%s' contains no text lines", n_region.id) lines.extend(n_lines) return lines
def _validate_page_xsd(self):
    """
    Validate all PAGE-XML files against PAGE XSD schema
    """
    log = getLogger('ocrd.workspace_validator')
    log.debug("Validating all PAGE-XML files against XSD")
    for ocrd_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
        # Make sure the file is available locally before validating it
        self.workspace.download_file(ocrd_file)
        for err in XsdPageValidator.validate(
                Path(ocrd_file.local_filename)).errors:
            self.report.add_error("%s: %s" % (ocrd_file.ID, err))
    # BUGFIX: message typo "alidating" -> "validating"
    log.debug("Finished validating all PAGE-XML files against XSD")
def _fix_segment(segment, page_id, reverse=False):
    """Fix order of child elements of (region/line/word) segment.

    If the segment's own text does not match the concatenation of its
    children's texts, try re-sorting the children geometrically
    (top-to-bottom for regions, left-to-right for lines/words, reversed
    if ``reverse``) and adopt the new order only when it reproduces the
    segment text.
    """
    LOG = getLogger('processor.RepairInconsistencies')
    # Pick joiner, sort axis and child accessors per segment type
    if isinstance(segment, TextRegionType):
        joiner = '\n'
        sort_horizontal = False
        children = segment.get_TextLine()
        adoption = segment.set_TextLine
    elif isinstance(segment, TextLineType):
        joiner = ' '
        sort_horizontal = True
        children = segment.get_Word()
        adoption = segment.set_Word
    elif isinstance(segment, WordType):
        joiner = ''
        sort_horizontal = True
        children = segment.get_Glyph()
        adoption = segment.set_Glyph
    else:
        raise Exception('invalid element type %s of segment to fix' %
                        type(segment))
    if not children:
        return
    segment_text = get_text(segment)
    concat_text = get_text(children, joiner)
    # Only act if texts disagree even after ignoring the joiner characters
    if (segment_text and concat_text and segment_text != concat_text
            and segment_text.replace(joiner, '') != concat_text.replace(
                joiner, '')):

        def polygon_position(child, horizontal=sort_horizontal):
            polygon = Polygon(polygon_from_points(child.get_Coords().points))
            if horizontal:
                return polygon.centroid.x
            return polygon.centroid.y

        sorted_children = sorted(children,
                                 reverse=reverse,
                                 key=polygon_position)
        sorted_concat_text = get_text(sorted_children, joiner)
        if (segment_text == sorted_concat_text
                or segment_text.replace(joiner, '') ==
                sorted_concat_text.replace(joiner, '')):
            LOG.info('Fixing element order of page "%s" segment "%s"',
                     page_id, segment.id)
            adoption(sorted_children)
        else:
            # BUGFIX: the two adjacent string literals previously joined
            # without a separating space, producing "... to %sdoes not
            # suffice ..." in the log output.
            LOG.debug(
                'Resorting children of page "%s" segment "%s" from %s to %s '
                'does not suffice to turn "%s" into "%s"', page_id,
                segment.id, str([seg.id for seg in children]),
                str([seg.id for seg in sorted_children]), concat_text,
                segment_text)
def _copy_impl(self, src_filename, filename, progress_cb=None): log = getLogger('ocrd.resource_manager._copy_impl') log.info("Copying %s" % src_filename) with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in: while True: chunk = f_in.read(4096) if chunk: f_out.write(chunk) if progress_cb: progress_cb(len(chunk)) else: break
def test_logging_really_non_duplicate(self):
    """A record must reach an ancestor's handler once and never the root's."""
    initLogging()
    child_logger = getLogger('a.b')
    print(child_logger)
    parent_logger = getLogger('a')
    root_logger = getLogger('')
    self.assertFalse(root_logger.propagate, 'root logger should not propagate')
    self.assertTrue(parent_logger.propagate, 'parent has no handler => do propagate')
    self.assertTrue(child_logger.propagate, 'child no handler => do propagate')

    def attach_capture(logger):
        # Attach a fresh FIFO-backed stream handler and return its FIFO.
        capture = FIFOIO(256)
        handler = logging.StreamHandler(capture)
        handler.setFormatter(
            logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
        logger.addHandler(handler)
        return capture

    root_capture = attach_capture(root_logger)
    parent_capture = attach_capture(parent_logger)

    # parent_logger = getLogger('a')
    # self.assertFalse(parent_logger.propagate, 'parent has handler now => do not propagate')

    self.assertTrue(child_logger.propagate,
                    'child has still no handler => do propagate')

    child_logger.error('test')

    root_str = root_capture.getvalue()
    parent_str = parent_capture.getvalue()
    print('root_str=%s' % root_str)
    print('parent_str=%s' % parent_str)

    # Exactly one line at the parent, none at the (non-propagating) root.
    self.assertEqual(root_str.count('\n'), 0)
    self.assertEqual(parent_str.count('\n'), 1)
def download(
    self,
    executable,
    url,
    basedir,
    overwrite=False,
    no_subdir=False,
    name=None,
    resource_type='file',
    path_in_archive='.',
    progress_cb=None,
    size=None,
):
    """
    Download a resource by URL

    Args:
        executable: processor the resource belongs to; used as the
            subdirectory name under ``basedir`` (unless ``no_subdir``)
        url: http(s) URL to download, or a local path to copy
        basedir: base directory for resources
        overwrite (bool): re-download/copy even if the target exists
        no_subdir (bool): place the resource directly in ``basedir``
        name: target file name; derived from the URL path if not given
        resource_type: 'file' or 'tarball'
        path_in_archive: for tarballs, path inside the archive to copy
            to the destination
        progress_cb: optional progress callback, passed through to the
            download/copy implementation
        size: passed to the download implementation for tarballs
            (presumably the expected download size — TODO confirm
            against ``_download_impl``)

    Returns:
        Path to the downloaded/copied resource
    """
    log = getLogger('ocrd.resource_manager.download')
    destdir = Path(basedir) if no_subdir else Path(basedir, executable)
    if not name:
        # Derive the file name from the (percent-decoded) URL path
        url_parsed = urlparse(url)
        name = Path(unquote(url_parsed.path)).name
    fpath = Path(destdir, name)
    is_url = url.startswith('https://') or url.startswith('http://')
    if fpath.exists() and not overwrite:
        # Target already present: keep it and return early
        log.info(
            "%s to be %s to %s which already exists and overwrite is False" %
            (url, 'downloaded' if is_url else 'copied', fpath))
        return fpath
    destdir.mkdir(parents=True, exist_ok=True)
    if resource_type == 'file':
        if is_url:
            self._download_impl(url, fpath, progress_cb)
        else:
            self._copy_impl(url, fpath, progress_cb)
    elif resource_type == 'tarball':
        # Fetch into a temporary directory, unpack there, then copy the
        # requested archive path into place
        with pushd_popd(tempdir=True):
            if is_url:
                self._download_impl(url, 'download.tar.xx', progress_cb,
                                    size)
            else:
                self._copy_impl(url, 'download.tar.xx', progress_cb)
            Path('out').mkdir()
            with pushd_popd('out'):
                log.info("Extracting tarball")
                with open_tarfile('../download.tar.xx', 'r:*') as tar:
                    tar.extractall()
                log.info("Copying '%s' from tarball to %s" %
                         (path_in_archive, fpath))
                copytree(path_in_archive, str(fpath))
    # TODO
    # elif resource_type == 'github-dir':
    return fpath
def spill(self, src, dest):
    """
    Spill a workspace, i.e. unpack it and turn it into a workspace.

    See https://ocr-d.github.com/ocrd_zip#unpacking-ocrd-zip-to-a-workspace

    Arguments:
        src (string): Path to OCRD-ZIP
        dest (string): Path to directory to unpack data folder to
    """
    log = getLogger('ocrd.workspace_bagger')
    if exists(dest) and not isdir(dest):
        raise Exception("Not a directory: %s" % dest)

    # If dest is an existing directory, try to derive its name from src
    if isdir(dest):
        derived = join(dest, re.sub(r'(\.ocrd)?\.zip$', '', basename(src)))
        if exists(derived):
            raise Exception("Directory exists: %s" % derived)
        dest = derived

    log.info("Spilling %s to %s", src, dest)

    bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
    unzip_file_to_dir(src, bagdir)

    # Mirror the payload ('data') directory tree into dest, creating
    # target directories lazily (only for roots that contain files)
    datadir = join(bagdir, 'data')
    for root, _, files in walk(datadir):
        destdir = join(dest, relpath(root, datadir))
        for fname in files:
            if not exists(destdir):
                makedirs(destdir)
            srcfile = join(root, fname)
            destfile = join(destdir, fname)
            log.debug("Copy %s -> %s", srcfile, destfile)
            copyfile(srcfile, destfile)

    # TODO copy allowed tag files if present
    # TODO validate bagit

    # Drop tempdir
    rmtree(bagdir)

    # Create workspace
    workspace = Workspace(self.resolver, directory=dest)

    # TODO validate workspace

    return workspace
def process(self): """Segment pages into regions using a Mask R-CNN model.""" assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) LOG = getLogger('processor.AnybaseocrBlockSegmenter') if not tf.test.is_gpu_available(): LOG.warning( "Tensorflow cannot detect CUDA installation. Running without GPU will be slow." ) for input_file in self.input_files: pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) page = pcgts.get_Page() page_id = input_file.pageId or input_file.ID # todo rs: why not cropped? page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_filter='binarized,deskewed,cropped,clipped,non_text') # try to load pixel masks try: # todo rs: this combination only works for tiseg with use_deeplr=true mask_image, _, _ = self.workspace.image_from_page( page, page_id, feature_selector='clipped', feature_filter='binarized,deskewed,cropped,non_text') except: mask_image = None if page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) else: dpi = None self._process_segment(page_image, page, page_xywh, page_id, input_file, mask_image, dpi) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) self.workspace.add_file(ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Evaluate OCR text against GT with dinglehopper, adding per-page reports."""
    assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
    assert_file_grp_cardinality(self.output_file_grp, 1)
    log = getLogger("processor.OcrdDinglehopperEvaluate")

    metrics = self.parameter["metrics"]
    textequiv_level = self.parameter["textequiv_level"]
    gt_grp, ocr_grp = self.input_file_grp.split(",")

    # Create the output fileGrp directory once, up front (previously a
    # try/except FileExistsError repeated inside the loop)
    os.makedirs(self.output_file_grp, exist_ok=True)

    input_file_tuples = self.zip_input_files(on_error='abort')
    for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
        if not gt_file or not ocr_file:
            # file/page was not found in this group
            continue
        gt_file = self.workspace.download_file(gt_file)
        ocr_file = self.workspace.download_file(ocr_file)
        page_id = gt_file.pageId

        log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

        file_id = make_file_id(ocr_file, self.output_file_grp)
        report_prefix = os.path.join(self.output_file_grp, file_id)

        # Process the files
        cli_process(
            gt_file.local_filename,
            ocr_file.local_filename,
            report_prefix,
            metrics=metrics,
            textequiv_level=textequiv_level,
        )

        # Add reports to the workspace
        for report_suffix, mimetype in [
            [".html", "text/html"],
            [".json", "application/json"],
        ]:
            self.workspace.add_file(
                ID=file_id + report_suffix,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype=mimetype,
                local_filename=report_prefix + report_suffix,
            )

        # Clear cache between files
        levenshtein_matrix_cache_clear()
def run_tasks(mets, log_level, page_id, task_strs):
    """Parse, validate and sequentially execute processor tasks on a workspace."""
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    log = getLogger('ocrd.task_sequence')
    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]

    for task in tasks:
        task.validate()

        # check input file groups are in mets
        for required_grp in task.input_file_grps:
            if required_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Unmet requirement: expected input file group not contained in mets: %s"
                    % required_grp)

        for produced_grp in task.output_file_grps:
            if produced_grp in workspace.mets.file_groups:
                raise Exception(
                    "Conflict: output file group already contained in mets: %s"
                    % produced_grp)

        log.info("Start processing task '%s'", task)

        # execute cli
        returncode = run_cli(task.executable,
                             mets,
                             resolver,
                             workspace,
                             log_level=log_level,
                             page_id=page_id,
                             input_file_grp=','.join(task.input_file_grps),
                             output_file_grp=','.join(task.output_file_grps),
                             parameter=task.parameter_path)

        # check return code
        if returncode != 0:
            raise Exception("%s exited with non-zero return value %s" %
                            (task.executable, returncode))

        log.info("Finished processing task '%s'", task)

        # reload mets
        workspace.reload_mets()

        # check output file groups are in mets
        for produced_grp in task.output_file_grps:
            if produced_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Invalid state: expected output file group not in mets: %s"
                    % produced_grp)
def _line_to_tokens(self, n_line): LOG = getLogger('processor.FSTCorrection') result = [] n_words = n_line.get_Word() if not n_words: LOG.warning("Line '%s' contains no word", n_line.id) for n_word in n_words: n_textequivs = n_word.get_TextEquiv() if n_textequivs and n_textequivs[0].Unicode: result.append(n_textequivs[0].Unicode) else: LOG.warning("Word '%s' contains no text results", n_word.id) return result
def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local
    workspace).

    Run the processor CLI :py:attr:`executable` on the workspace, passing:
    - the workspace,
    - :py:attr:`page_id`
    - :py:attr:`input_file_grp`
    - :py:attr:`output_file_grp`
    - :py:attr:`parameter` (after applying any :py:attr:`parameter_override` settings)

    (Will create output files and update the in the filesystem).

    Args:
        executable (string): Executable name of the module processor.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory,
            '--mets', mets_url]
    # Append optional flags only when a value was supplied
    for flag, value in (
            ('--log-level', log_level),
            ('--page-id', page_id),
            ('--input-file-grp', input_file_grp),
            ('--output-file-grp', output_file_grp),
            ('--parameter', parameter),
    ):
        if value:
            args += [flag, value]
    if overwrite:
        args.append('--overwrite')
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    return run(args, check=False).returncode
def check_file_grp(workspace, input_file_grp=None, output_file_grp=None,
                   page_id=None, report=None):
    """
    Return a report on whether input_file_grp is/are in workspace.mets and
    output_file_grp is/are not.

    To be run before processing

    Arguments:
        workspace (Workspace) the workspace to validate
        input_file_grp (list|string) list or comma-separated list of input file groups
        output_file_grp (list|string) list or comma-separated list of output file groups
        page_id (list|string) list or comma-separated list of page_ids to write to
    """
    if not report:
        report = ValidationReport()
    # Normalize comma-separated strings to lists
    if isinstance(input_file_grp, str):
        input_file_grp = input_file_grp.split(',') if input_file_grp else []
    if isinstance(output_file_grp, str):
        output_file_grp = output_file_grp.split(',') if output_file_grp else []
    if page_id and isinstance(page_id, str):
        page_id = page_id.split(',')
    log = getLogger('ocrd.workspace_validator')
    # Lazy %-style args instead of eager string interpolation
    log.debug("input_file_grp=%s output_file_grp=%s", input_file_grp,
              output_file_grp)
    if input_file_grp:
        for grp in input_file_grp:
            if grp not in workspace.mets.file_groups:
                report.add_error("Input fileGrp[@USE='%s'] not in METS!" % grp)
    if output_file_grp:
        for grp in output_file_grp:
            if grp in workspace.mets.file_groups:
                if page_id:
                    # An existing output group is only an error if it already
                    # contains output for one of the requested pages
                    for one_page_id in page_id:
                        if next(
                                workspace.mets.find_files(
                                    fileGrp=grp, pageId=one_page_id), None):
                            report.add_error(
                                "Output fileGrp[@USE='%s'] already contains output for page %s"
                                % (grp, one_page_id))
                else:
                    report.add_error(
                        "Output fileGrp[@USE='%s'] already in METS!" % grp)
    return report
def _page_for_id_lazy(self, page_id: str,
                      file_group: str = None) -> Optional['Page']:
    """Return a LazyPage for page_id in file_group, or None if unresolvable."""
    log = getLogger('ocrd_browser.model.document.Document.page_for_id')
    if not page_id:
        return None
    page = LazyPage(self, page_id, file_group)
    if not page.file:
        # Lazy %-style args instead of eager str.format (message only
        # rendered if the warning is actually emitted)
        log.warning("No PAGE-XML and no image for page '%s' in fileGrp '%s'",
                    page_id, file_group)
        return None
    return page
def launch(self, toolname: str, doc: Document,
           file: OcrdFile) -> Optional[Popen]:  # type: ignore[type-arg]
    """Launch the configured tool by name; log configuration help if unknown."""
    if toolname not in self.tools:
        # Unknown tool: tell the user exactly what to add to their config
        log = getLogger('ocrd_browser.util.launcher.Launcher.launch')
        log.error(
            'Tool "%s" not found in your config, to fix place the following section in your ocrd-browser.conf',
            toolname)
        log.error('[Tool %s]', toolname)
        log.error(
            'commandline = /usr/bin/yourtool --base-dir {workspace.directory} {file.path.absolute}'
        )
        return None
    return self.launch_tool(self.tools[toolname], doc, file)
def _repair_tokenisation(tokenisation, concatenation, next_token): LOG = getLogger('processor.KerasRate') # invariant: text should contain a representation that concatenates into actual tokenisation # ideally, both overlap (concatenation~tokenisation) i = 0 for i in range(min(len(tokenisation), len(concatenation)), -1, -1): if concatenation[-i:] == tokenisation[:i]: break if i > 0 and tokenisation[i:].startswith( next_token): # without white space? LOG.warning('Repairing tokenisation between "%s" and "%s"', concatenation[-i:], next_token) return True # repair by skipping space/newline here return False
def create_logmap_smlink(self, workspace):
    """Ensure the METS has a LOGICAL structMap and a structLink, caching both."""
    LOG = getLogger('OcrdAnybaseocrLayoutAnalyser')
    el_root = self.workspace.mets._tree.getroot()
    log_map = el_root.find('mets:structMap[@TYPE="LOGICAL"]', NS)
    if log_map is not None:
        LOG.info('LOGICAL structMap already exists, adding to it')
    else:
        # No LOGICAL structMap yet: create one
        log_map = ET.SubElement(el_root, TAG_METS_STRUCTMAP)
        log_map.set('TYPE', 'LOGICAL')
    link = el_root.find('mets:structLink', NS)
    if link is None:
        link = ET.SubElement(el_root, TAG_METS_STRUCTLINK)
    self.link = link
    self.log_map = log_map
def handle_oai_response(response):
    """
    In case of a valid OAI-Response, extract first METS-Entry-Data
    """
    log = getLogger('ocrd_models.utils.handle_oai_response')
    content_type = response.headers['Content-Type']
    if any(token in content_type for token in ('xml', 'text')):
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            # Textual content type but unparseable: fall through to raw body
            log.warning("textual response but no xml: %s (%s)", content, exc)
    return response.content
def validate(filename=None, ocrd_page=None, ocrd_file=None,
             page_textequiv_consistency='strict',
             page_textequiv_strategy='first',
             check_baseline=True, check_coords=True):
    """
    Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

    Arguments:
        filename (string): Path to PAGE
        ocrd_page (OcrdPage): OcrdPage instance
        ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
        page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
        page_textequiv_strategy (string): Currently only 'first'
        check_baseline (bool): whether Baseline must be fully within TextLine/Coords
        check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
            contained within Border/*Region/TextLine/Word, resp.

    Returns:
        report (:class:`ValidationReport`) Report on the validity
    """
    log = getLogger('ocrd.page_validator.validate')
    if ocrd_page:
        page = ocrd_page
        file_id = ocrd_page.get_pcGtsId()
    elif ocrd_file:
        page = page_from_file(ocrd_file)
        file_id = ocrd_file.ID
    elif filename:
        page = parse(filename, silence=True)
        file_id = filename
    else:
        raise Exception(
            "At least one of ocrd_page, ocrd_file or filename must be set")
    # BUGFIX: ('first') is just the string 'first', so ``not in`` performed a
    # substring test (accepting e.g. 'irs'); use a proper 1-tuple instead
    if page_textequiv_strategy not in ('first', ):
        raise Exception("page_textequiv_strategy %s not implemented" %
                        page_textequiv_strategy)
    if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
        raise Exception("page_textequiv_consistency level %s not implemented" %
                        page_textequiv_consistency)
    report = ValidationReport()
    log.info("Validating input file '%s'", file_id)
    validate_consistency(page, page_textequiv_consistency,
                         page_textequiv_strategy, check_baseline,
                         check_coords, report, file_id)
    return report
def process(self):
    """Classify page layout type with a Keras model and link results in METS."""
    LOG = getLogger('OcrdAnybaseocrLayoutAnalyser')
    if not tf.test.is_gpu_available():
        LOG.error("Your system has no CUDA installed. No GPU detected.")
        # sys.exit(1)
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    model_path = Path(self.parameter['model_path'])
    class_mapper_path = Path(self.parameter['class_mapping_path'])
    if not Path(model_path).is_file():
        # BUGFIX: message typo "Classfication" -> "Classification"
        LOG.error("""\
            Layout Classification model was not found at '%s'. Make sure the `model_path` parameter
            points to the local model path.
            model can be downloaded from http://url
            """ % model_path)
        sys.exit(1)
    else:
        LOG.info('Loading model from file %s', model_path)
        model = self.create_model(str(model_path))
    # load the mapping (index -> class), closing the file handle afterwards
    # (was a bare open() without close, i.e. a file handle leak)
    with open(str(class_mapper_path), "rb") as pickle_in:
        class_indices = pickle.load(pickle_in)
    label_mapping = dict((v, k) for k, v in class_indices.items())

    # print("INPUT FILE HERE",self.input_files)
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        fname = pcgts.get_Page().imageFilename
        page_id = input_file.pageId or input_file.ID
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # Model expects a 500x600 grayscale image normalized to [0, 1]
        img_array = ocrolib.pil2array(
            page_image.resize((500, 600), Image.ANTIALIAS))
        img_array = img_array * 1. / 255.
        img_array = img_array[np.newaxis, :, :, np.newaxis]
        results = self.start_test(model, img_array, fname, label_mapping)
        LOG.info(results)
        self.workspace.mets.set_physical_page_for_file(
            "PHYS_000" + str(n), input_file)
        self.create_logmap_smlink(pcgts)
        self.write_to_mets(results, "PHYS_000" + str(n))
def resolve_mets_arguments(self, directory, mets_url, mets_basename):
    """
    Resolve the ``--mets``, ``--mets-basename`` and ``--directory`` argument
    into a coherent set of arguments according to
    https://github.com/OCR-D/core/issues/517
    """
    log = getLogger('ocrd.resolver.resolve_mets_arguments')
    mets_is_remote = mets_url and (mets_url.startswith('http://')
                                   or mets_url.startswith('https://'))

    # XXX we might want to be more strict like this but it might break
    # legacy code
    # Allow --mets and --directory together iff --mets is a remote URL
    # if directory and mets_url and not mets_is_remote:
    #     raise ValueError("Use either --mets or --directory, not both")

    # If --mets is a URL, a directory must be explicitly provided (not
    # strictly necessary, but retained for legacy behavior)
    if not directory and mets_is_remote:
        raise ValueError(
            "--mets is an http(s) URL but no --directory was given")

    # Determine --mets-basename
    if not mets_basename and mets_url:
        mets_basename = Path(mets_url).name
    elif not mets_basename and not mets_url:
        mets_basename = 'mets.xml'
    elif mets_basename and mets_url:
        raise ValueError("Use either --mets or --mets-basename, not both")
    else:
        # BUGFIX: this warning string literal was broken by a literal line
        # break in the source; reconstructed as a single line
        warn("--mets-basename is deprecated. Use --mets/--directory instead",
             DeprecationWarning)

    # Determine --directory and --mets-url
    if not directory and not mets_url:
        directory = Path.cwd()
        mets_url = Path(directory, mets_basename)
    elif directory and not mets_url:
        directory = Path(directory).resolve()
        mets_url = directory / mets_basename
    elif not directory and mets_url:
        mets_url = Path(mets_url).resolve()
        directory = mets_url.parent
    else:  # == directory and mets_url:
        directory = Path(directory).resolve()
        if not mets_is_remote:
            # --mets is just a basename and --directory is set, so treat
            # --mets as --mets-basename
            if Path(mets_url).parent == Path('.'):
                mets_url = directory / mets_url
            else:
                mets_url = Path(mets_url).resolve()
                if not is_file_in_directory(directory, mets_url):
                    raise ValueError(
                        "--mets '%s' has a directory part inconsistent with --directory '%s'"
                        % (mets_url, directory))

    return str(Path(directory).resolve()), str(mets_url), str(mets_basename)
def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None,  # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):  # pylint: disable=too-many-locals
    """
    Create a workspace for mets_url and run processor through it

    Args:
        parameter (string): URL to the parameter
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    # NOTE(review): `log` is not defined in this function — presumably a
    # module-level logger; confirm it exists at module scope.
    log.debug("Running processor %s", processorClass)
    processor = processorClass(workspace,
                               ocrd_tool=ocrd_tool,
                               page_id=page_id,
                               input_file_grp=input_file_grp,
                               output_file_grp=output_file_grp,
                               parameter=parameter)
    # The instance's ocrd_tool supersedes whatever was passed in
    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    otherrole = ocrd_tool['steps'][0]
    logProfile = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, name,
              otherrole)
    # Time the actual processing and log it on the dedicated profiling logger
    t0 = time()
    processor.process()
    t1 = time() - t0
    logProfile.info(
        'Executing processor "%s" took %fs [--input-file-grp="%s" --output-file-grp="%s" --parameter="%s"]'
        % (ocrd_tool['executable'], t1,
           input_file_grp if input_file_grp else '',
           output_file_grp if output_file_grp else '',
           parameter if parameter else {}))
    # Record the processor run as a METS agent and persist the METS
    workspace.mets.add_agent(name=name,
                             _type='OTHER',
                             othertype='SOFTWARE',
                             role='OTHER',
                             otherrole=otherrole)
    workspace.save_mets()
    return processor
def setup(self):
    """Load the Mask R-CNN inference model with the configured weights."""
    LOG = getLogger('processor.AnybaseocrBlockSegmenter')
    #self.reading_order = []
    self.order = 0
    mrcnn_dir = resource_filename(__name__, '../mrcnn')
    weights_path = Path(
        self.resolve_resource(self.parameter['block_segmentation_weights']))
    inference_config = InferenceConfig(self.parameter['min_confidence'])
    self.mrcnn_model = model.MaskRCNN(mode="inference",
                                      model_dir=str(mrcnn_dir),
                                      config=inference_config)
    self.mrcnn_model.load_weights(str(weights_path), by_name=True)
def _clone_workspace(cls, mets_url: Union[Path, str]) -> Workspace:
    """
    Clones a workspace (mets.xml and all used files) to a temporary
    directory for editing
    """
    log = getLogger('ocrd_browser.model.document.Document._clone_workspace')
    mets_url = cls._strip_local(mets_url, disallow_remote=False)
    tmp_dir = mkdtemp(prefix='browse-ocrd-clone-')
    # Remember the tempdir so it can be cleaned up later
    cls.temporary_workspaces.append(tmp_dir)
    # TODO download = False and lazy loading would be nice for responsiveness
    log.info("Cloning '%s' to '%s'", mets_url, tmp_dir)
    return Resolver().workspace_from_url(mets_url=mets_url,
                                         dst_dir=tmp_dir,
                                         download=True)
def process(self):
    """Copy input files to the output fileGrp; derive PAGE-XML for images."""
    LOG = getLogger('ocrd.dummy')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for input_file in self.input_files:
        input_file = self.workspace.download_file(input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        ext = MIME_TO_EXT.get(input_file.mimetype, '')
        local_filename = join(self.output_file_grp, file_id + ext)
        # input_file was already downloaded above — the previous second
        # download_file() call here was redundant
        pcgts = page_from_file(input_file)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename,
                 input_file.ID, file_id)
        if input_file.mimetype == MIMETYPE_PAGE:
            # Source file is PAGE-XML: Write out in-memory PcGtsType
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=input_file.mimetype,
                                    local_filename=local_filename,
                                    content=to_xml(pcgts).encode('utf-8'))
        else:
            # Source file is not PAGE-XML: Copy byte-by-byte
            with open(input_file.local_filename, 'rb') as f:
                content = f.read()
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=input_file.mimetype,
                                    local_filename=local_filename,
                                    content=content)
            if input_file.mimetype.startswith('image/'):
                # write out the PAGE-XML representation for this image
                page_file_id = file_id + '_PAGE'
                pcgts.set_pcGtsId(page_file_id)
                pcgts.get_Page().set_imageFilename(local_filename)
                page_filename = join(self.output_file_grp, file_id + '.xml')
                LOG.info("Add PAGE-XML %s generated for %s at %s",
                         page_file_id, file_id, page_filename)
                self.workspace.add_file(ID=page_file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=page_filename,
                                        content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Detect text lines on binarized+deskewed pages or regions.

    For each input file, read the PAGE result, take the binarized and
    deskewed page image, then run line segmentation either on the whole
    page or per TextRegion (depending on the ``operation_level``
    parameter), and add the updated PAGE-XML to the output fileGrp.
    """
    LOG = getLogger('OcrdAnybaseocrTextline')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)
        # requires a binarized and deskewed page image
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized,deskewed')
        if oplevel == 'page':
            LOG.warning("Operation level should be region.")
            self._process_segment(page_image, page, None, page_xywh,
                                  page_id, input_file, n)
        else:
            regions = page.get_TextRegion()
            if not regions:
                # nothing to segment on this page
                LOG.warning("Page '%s' contains no text regions", page_id)
                continue
            for (k, region) in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                self._process_segment(region_image, page, region,
                                      region_xywh, region.id, input_file, k)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
    """Initialize the resource manager; create the user list if missing."""
    self.log = getLogger('ocrd.resource_manager')
    self.database = {}
    self._xdg_data_home = xdg_data_home
    self._xdg_config_home = xdg_config_home
    self._userdir = userdir
    self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
    # Load the built-in resource list first ...
    self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
    # ... then make sure a user list exists and load it on top
    if not self.user_list.exists():
        self.user_list.parent.mkdir(parents=True, exist_ok=True)
        with open(str(self.user_list), 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
    self.load_resource_list(self.user_list)
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in --directory.
    """
    LOG = getLogger('ocrd.cli.workspace.init')
    if directory:
        LOG.warning(DeprecationWarning(
            "Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')"
            % directory))
        ctx.directory = directory
    target_dir = os.path.abspath(ctx.directory)
    new_workspace = ctx.resolver.workspace_from_nothing(
        directory=target_dir,
        mets_basename=basename(ctx.mets_url),
        clobber_mets=clobber_mets)
    new_workspace.save_mets()
    print(new_workspace.directory)
def xmllint_format(xml):
    """
    Pretty-print XML like ``xmllint`` does.

    Arguments:
        xml (string): Serialized XML
    """
    log = getLogger('ocrd_models.utils.xmllint_format')
    parser = ET.XMLParser(resolve_entities=False,
                          strip_cdata=False,
                          remove_blank_text=True)
    document = ET.fromstring(xml, parser)
    pretty = ET.tostring(document, pretty_print=True,
                         encoding='UTF-8').decode('utf-8')
    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return ('%s\n%s' % (declaration, pretty)).encode('utf-8')