def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

    dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract
    their text and falls back to plain text if no ALTO or PAGE is detected.

    The files GT and OCR are usually a ground truth document and the result of
    an OCR software, but you may use dinglehopper to compare two OCR results.
    In that case, use --no-metrics to disable the then meaningless metrics and
    also change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character
    error rate (CER) and the word error rate (WER).

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
    Config.progress = progress
    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
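# A hedged sketch of driving the same comparison programmatically rather than
# via the CLI. Assumes dinglehopper is installed; the exact import path of
# process() has varied across dinglehopper versions, and the file names here
# are made up for illustration:
from dinglehopper.cli import process
from ocrd_utils import initLogging

initLogging()
# writes report.html and report.json into the current working directory
process('page_0001.gt.xml', 'page_0001.ocr.xml', 'report',
        metrics=True, textequiv_level='region')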
def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry"""

    # arrange
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    headers = {'Content-Type': 'text/plain'}
    mock_get.return_value.headers = headers
    resolver = Resolver()
    initLogging()

    # capture log
    log = getLogger('ocrd_models.utils.handle_oai_response')
    capt = FIFOIO(256)
    sh = StreamHandler(capt)
    sh.setFormatter(Formatter(LOG_FORMAT))
    log.addHandler(sh)

    # act
    resolver.download_to_directory(response_dir, url)

    # assert
    mock_get.assert_called_once_with(url)
    log_output = capt.getvalue()
    assert 'WARNING ocrd_models.utils.handle_oai_response' in log_output
def test_logging_really_non_duplicate(self):
    initLogging()
    child_logger = getLogger('a.b')
    print(child_logger)
    parent_logger = getLogger('a')
    root_logger = getLogger('')
    self.assertFalse(root_logger.propagate, 'root logger should not propagate')
    self.assertTrue(parent_logger.propagate, 'parent has no handler => do propagate')
    self.assertTrue(child_logger.propagate, 'child no handler => do propagate')

    root_capture = FIFOIO(256)
    root_handler = logging.StreamHandler(root_capture)
    root_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    root_logger.addHandler(root_handler)

    parent_capture = FIFOIO(256)
    parent_handler = logging.StreamHandler(parent_capture)
    parent_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    parent_logger.addHandler(parent_handler)

    # parent_logger = getLogger('a')
    # self.assertFalse(parent_logger.propagate, 'parent has handler now => do not propagate')

    self.assertTrue(child_logger.propagate, 'child has still no handler => do propagate')

    child_logger.error('test')

    root_str = root_capture.getvalue()
    parent_str = parent_capture.getvalue()
    print('root_str=%s' % root_str)
    print('parent_str=%s' % parent_str)

    self.assertEqual(root_str.count('\n'), 0)
    self.assertEqual(parent_str.count('\n'), 1)
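# The propagation rule the logging tests here exercise, as a self-contained
# stdlib sketch (no ocrd_utils involved): a record emitted on a child logger
# reaches ancestor handlers only while the child's propagate flag is True.
import io
import logging

buf_root = io.StringIO()
buf_child = io.StringIO()
root = logging.getLogger('')
root.addHandler(logging.StreamHandler(buf_root))
child = logging.getLogger('demo.child')
child.addHandler(logging.StreamHandler(buf_child))

child.propagate = False  # what ocrd_utils.getLogger arranges once a handler is attached
child.error('one')
# only the child's handler saw the record
assert buf_child.getvalue().count('\n') == 1 and buf_root.getvalue() == ''

child.propagate = True
child.error('two')
# now both handlers saw the second record
assert buf_child.getvalue().count('\n') == 2 and buf_root.getvalue().count('\n') == 1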
def test_tmpConfigfile(self):
    self.assertNotEqual(logging.getLogger('').getEffectiveLevel(), logging.NOTSET)

    with TemporaryDirectory() as tempdir:
        with pushd_popd(tempdir):
            # write logging configuration file (MWE)
            with open('ocrd_logging.conf', 'w') as f:
                f.write('''
[loggers]
keys=root

[handlers]
keys=consoleHandler

[formatters]
keys=

[logger_root]
level=ERROR
handlers=consoleHandler

[handler_consoleHandler]
class=StreamHandler
formatter=
args=(sys.stdout,)
''')
            # this will call logging.config.fileConfig with disable_existing_loggers=True,
            # so the defaults from the import-time initLogging should be invalidated
            initLogging()
            # ensure log level is set from temporary config file
            self.assertEqual(logging.getLogger('').getEffectiveLevel(), logging.ERROR)
def main() -> None:
    from ocrd_utils import initLogging
    initLogging()
    from ocrd_browser.application import OcrdBrowserApplication
    install_excepthook()
    app = OcrdBrowserApplication()
    app.run(sys.argv)
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.)
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        msg = "File does not exist: %s" % mets
        raise Exception(msg)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    processor = Processor(workspace,
                          ocrd_tool=None,
                          page_id=kwargs['page_id'],
                          input_file_grp=kwargs['input_file_grp'],
                          output_file_grp=kwargs['output_file_grp'])
    for input_file in processor.input_files:
        for field in ['url', 'ID', 'mimetype', 'pageId']:
            # make this bash-friendly (show initialization for associative array)
            print("[%s]='%s'" % (field, getattr(input_file, field)), end=' ')
        print("[outputFileId]='%s'" % make_file_id(input_file, kwargs['output_file_grp']))
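# What one output line of bashlib_input_files looks like; the field values
# below are hypothetical, but the format is exactly the print calls above
# (bash reads each line as an associative-array initializer):
fields = {'url': 'OCR-D-IMG/FILE_0001.tif', 'ID': 'FILE_0001',
          'mimetype': 'image/tiff', 'pageId': 'PHYS_0001'}
for field in ['url', 'ID', 'mimetype', 'pageId']:
    print("[%s]='%s'" % (field, fields[field]), end=' ')
print("[outputFileId]='%s'" % 'OCR-D-OUT_FILE_0001')
# -> [url]='OCR-D-IMG/FILE_0001.tif' [ID]='FILE_0001' [mimetype]='image/tiff' [pageId]='PHYS_0001' [outputFileId]='OCR-D-OUT_FILE_0001'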
def process_cli(log_level, mets, page_id, tasks, overwrite):
    """
    Process a series of tasks
    """
    initLogging()
    log = getLogger('ocrd.cli.process')
    run_tasks(mets, log_level, page_id, tasks, overwrite)
    log.info("Finished")
def test_multiple_initLogging(self):
    disableLogging()
    initLogging()
    self.capture_out_err()
    initLogging()
    out = '\n'.join(self.capture_out_err())
    assert 'initLogging was called multiple times' in out
    assert __file__ in out
def workspace_cli(ctx, directory, mets, mets_basename, backup):
    """
    Working with workspace
    """
    initLogging()
    ctx.obj = WorkspaceCtx(directory, mets_url=mets, mets_basename=mets_basename, automatic_backup=backup)
def test_setOverrideLogLevel(self):
    initLogging()
    rootLogger = logging.getLogger('')
    somelogger = getLogger('foo.bar')
    somelogger.setLevel(getLevelName('ERROR'))
    setOverrideLogLevel('ERROR')
    self.assertEqual(rootLogger.getEffectiveLevel(), logging.ERROR)
    self.assertEqual(somelogger.getEffectiveLevel(), logging.ERROR)
    notherlogger = getLogger('bar.foo')
    self.assertEqual(notherlogger.getEffectiveLevel(), logging.ERROR)
    setOverrideLogLevel('INFO')
    somelogger = getLogger('foo.bar')
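# The override semantics exercised by the test above, as a usage sketch. All
# names come from ocrd_utils as imported in these tests; 'my.module' is a
# hypothetical logger name. As the assertions above show, the override applies
# to the root logger, to loggers already handed out, and to later ones too:
from ocrd_utils import initLogging, getLogger, setOverrideLogLevel

initLogging()
log = getLogger('my.module')
setOverrideLogLevel('ERROR')
log.info('suppressed by the override')
log.error('still emitted')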
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to
    extract its text and falls back to plain text if no ALTO or PAGE is
    detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
    input_text = extract(input_file, textequiv_level=textequiv_level).text
    print(input_text)
def test_logging_non_duplicate(self):
    """
    Verify that child loggers don't propagate a log message they handle
    """
    initLogging()
    root_logger = logging.getLogger('')
    self.assertTrue(root_logger.handlers, 'root logger has at least 1 handler')

    parent_capture = FIFOIO(256)
    parent_handler = logging.StreamHandler(parent_capture)
    parent_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    parent_logger = getLogger('foo')
    self.assertTrue(parent_logger.propagate, 'no handler on logger => propagate')

    parent_logger.addHandler(parent_handler)
    parent_logger = getLogger('foo')
    self.assertFalse(parent_logger.propagate, 'should not propagate because StreamHandler has been attached')

    child_logger = getLogger('foo.bar')
    self.assertTrue(child_logger.propagate, 'no handler on logger => propagate')

    child_logger.setLevel('DEBUG')
    child_logger.error('first error')

    child_capture = FIFOIO(256)
    child_handler = logging.StreamHandler(child_capture)
    child_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))

    child_logger.debug('first debug')
    child_logger.addHandler(child_handler)
    child_logger = getLogger('foo.bar')
    self.assertFalse(child_logger.propagate, 'should not propagate because StreamHandler has been attached')

    child_logger.debug('second debug')
    child_logger.error('second error')

    parent_output = parent_capture.getvalue()
    parent_capture.close()
    child_output = child_capture.getvalue()
    child_capture.close()

    # print('parent', parent_output)
    # print('child', child_output)

    self.assertTrue(match(TIMEFMT_RE + 'ERROR foo.bar - first error\n', parent_output),
                    'parent received first error but not second error nor first debug')
    self.assertTrue(match("\n".join([
        TIMEFMT_RE + 'DEBUG foo.bar - second debug',
        TIMEFMT_RE + 'ERROR foo.bar - second error',
    ]), child_output),
                    'child received second error and debug but not first error and debug')
def testProcessorProfiling(self):
    initLogging()
    log_capture_string = FIFOIO(256)
    ch = logging.StreamHandler(log_capture_string)
    ch.setFormatter(logging.Formatter(LOG_FORMAT))
    getLogger('ocrd.process.profile').setLevel('DEBUG')
    getLogger('ocrd.process.profile').addHandler(ch)
    run_processor(DummyProcessor, resolver=Resolver(),
                  mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
    log_contents = log_capture_string.getvalue()
    log_capture_string.close()
    # with open('/tmp/debug.log', 'w') as f:
    #     f.write(log_contents)
    # Check whether profile information has been logged. Dummy should finish in under 0.1s
    self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', log_contents))
def test_handle_common_oai_response(mock_get, response_dir, oai_response_content):
    """Base use case with valid OAI Response data"""
    initLogging()

    # arrange
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=9049'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = oai_response_content
    headers = {'Content-Type': 'text/xml'}
    mock_get.return_value.headers = headers
    resolver = Resolver()

    # act
    result = resolver.download_to_directory(response_dir, url)

    # assert
    mock_get.assert_called_once_with(url)
    assert result == 'oai'
def main(log_level, alto_version, check_words, check_border, skip_empty_lines,
         trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index,
         textequiv_fallback_strategy, region_order, output_file, filename):
    """
    Convert PAGE to ALTO
    """
    initLogging()
    converter = OcrdPageAltoConverter(
        alto_version=alto_version,
        page_filename=filename,
        check_words=check_words,
        check_border=check_border,
        skip_empty_lines=skip_empty_lines,
        trailing_dash_to_hyp=trailing_dash_to_hyp,
        dummy_textline=dummy_textline,
        dummy_word=dummy_word,
        textequiv_index=textequiv_index,
        textequiv_fallback_strategy=textequiv_fallback_strategy,
        region_order=region_order)
    converter.convert()
    with open(1 if output_file == '-' else output_file, 'w') as output:
        output.write(str(converter))
def main(gt, ocr, report_prefix, metrics):
    """
    Compare the GT line text directory against the OCR line text directory.

    This assumes that the GT line text directory contains textfiles with a
    common suffix like ".gt.txt", and the OCR line text directory contains
    textfiles with a common suffix like ".some-ocr.txt". The text files also
    need to be paired, i.e. the GT file "line001.gt.txt" needs to match a file
    "line001.some-ocr.txt" in the OCR lines directory.

    The GT and OCR directories are usually ground truth line texts and the
    results of an OCR software, but you may use dinglehopper to compare two
    OCR results. In that case, use --no-metrics to disable the then
    meaningless metrics and also change the color scheme from green/red to
    blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character
    error rate (CER) and the word error rate (WER).
    """
    initLogging()
    process(gt, ocr, report_prefix, metrics=metrics)
def main(image, out, model, save_images, save_layout, save_deskewed, save_all,
         enable_plotting, allow_enhancement, curved_line, full_layout, tables,
         input_binary, allow_scaling, headers_off, log_level):
    if log_level:
        setOverrideLogLevel(log_level)
    initLogging()
    if not enable_plotting and (save_layout or save_deskewed or save_all or save_images or allow_enhancement):
        print(
            "Error: You used one of -sl, -sd, -sa, -si or -ae but did not enable plotting with -ep"
        )
        sys.exit(1)
    elif enable_plotting and not (save_layout or save_deskewed or save_all or save_images or allow_enhancement):
        print(
            "Error: You used -ep to enable plotting but set none of -sl, -sd, -sa, -si or -ae"
        )
        sys.exit(1)
    eynollah = Eynollah(
        image_filename=image,
        dir_out=out,
        dir_models=model,
        dir_of_cropped_images=save_images,
        dir_of_layout=save_layout,
        dir_of_deskewed=save_deskewed,
        dir_of_all=save_all,
        enable_plotting=enable_plotting,
        allow_enhancement=allow_enhancement,
        curved_line=curved_line,
        full_layout=full_layout,
        tables=tables,
        input_binary=input_binary,
        allow_scaling=allow_scaling,
        headers_off=headers_off,
    )
    pcgts = eynollah.run()
    eynollah.writer.write_pagexml(pcgts)
def log_cli(ctx, name, *args, **kwargs):
    """
    Logging
    """
    initLogging()
    ctx.obj = LogCtx(name)
def ocrd_cli_wrap_processor(
        processorClass,
        ocrd_tool=None,
        mets=None,
        working_dir=None,
        dump_json=False,
        help=False,  # pylint: disable=redefined-builtin
        version=False,
        overwrite=False,
        **kwargs):
    if not sys.argv[1:]:
        processorClass(workspace=None, show_help=True)
        sys.exit(1)
    if dump_json or help or version:
        processorClass(workspace=None, dump_json=dump_json, show_help=help, show_version=version)
        sys.exit()
    else:
        initLogging()
        LOG = getLogger('ocrd_cli_wrap_processor')
        # LOG.info('kwargs=%s' % kwargs)
        # Merge parameter overrides and parameters
        if 'parameter_override' in kwargs:
            set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override'])
        # TODO OCR-D/core#274
        # Assert -I / -O
        # if not kwargs['input_file_grp']:
        #     raise ValueError('-I/--input-file-grp is required')
        # if not kwargs['output_file_grp']:
        #     raise ValueError('-O/--output-file-grp is required')
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        page_id = kwargs.get('page_id')
        # XXX not possible while processors do not adhere to
        # https://github.com/OCR-D/core/issues/505
        # if overwrite
        #     if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
        #         raise Exception("--overwrite requires --output-file-grp")
        #     LOG.info("Removing files because of --overwrite")
        #     for grp in kwargs['output_file_grp'].split(','):
        #         if page_id:
        #             for one_page_id in kwargs['page_id'].split(','):
        #                 LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
        #                 for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
        #                     workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
        #         else:
        #             LOG.debug("Removing all files in output file group %s ", grp)
        #             # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
        #             workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
        #     workspace.save_mets()
        # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
        if overwrite:
            workspace.overwrite_mode = True
        report = WorkspaceValidator.check_file_grp(
            workspace, kwargs['input_file_grp'],
            '' if overwrite else kwargs['output_file_grp'], page_id)
        if not report.is_valid:
            raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
        run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
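# How a concrete processor package typically uses this wrapper: a click
# command decorated with the shared OCR-D CLI options delegates straight to
# ocrd_cli_wrap_processor. A sketch of the standard pattern, not this repo's
# code; MyProcessor and its import path are hypothetical:
import click
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from my_package.processor import MyProcessor  # hypothetical import

@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    return ocrd_cli_wrap_processor(MyProcessor, *args, **kwargs)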
def zip_cli():
    """
    Bag/Spill/Validate OCRD-ZIP bags
    """
    initLogging()
def cli(output_file, normalization, gt_level, confusion, file_lists, gt_file, ocr_files):
    """Compare text lines by aligning and computing the textual distance and character error rate.

    This compares 1:n given PAGE-XML or plain text files.

    If `--file-lists` is given and files are plain text, then they will be
    interpreted as (newline-separated) lists of path names for single-line
    text files (for Ocropus convention).

    Writes a JSON report file to `--output-file`.
    (No error aggregation across files in this CLI.)
    """
    initLogging()
    LOG = logging.getLogger(__name__)
    LOG.setLevel(logging.INFO)
    aligners = [Alignment(logger=LOG, confusion=bool(confusion)) for _ in ocr_files]
    edits = [Edits(logger=LOG) for _ in ocr_files]
    LOG.info("processing '%s'", gt_file)
    gt_lines = get_lines(gt_file, file_lists)
    if not gt_lines:
        LOG.critical("file '%s' contains no text lines to compare", gt_file)
        exit(1)
    report = dict()
    for i, ocr_file in enumerate(ocr_files):
        LOG.info("processing '%s'", ocr_file)
        ocr_lines = get_lines(ocr_file, file_lists)
        if not ocr_lines:
            LOG.error("file '%s' contains no text lines to compare", ocr_file)
            continue
        pair = gt_file + ',' + ocr_file
        if isinstance(ocr_lines, dict):
            # from PAGE-XML file
            line_ids = ocr_lines.keys()
        else:
            # from plain text file
            line_ids = range(len(ocr_lines))
        for line_id in line_ids:
            lines = report.setdefault(pair, dict()).setdefault('lines', list())
            if isinstance(gt_lines, dict):
                has_line = line_id in gt_lines
            else:
                has_line = line_id < len(gt_lines)
            if not has_line:
                LOG.error("line '%s' in file '%s' is missing from GT file '%s'",
                          str(line_id), ocr_file, gt_file)
                lines.append({line_id: 'missing'})
                continue
            gt_line = gt_lines[line_id]
            ocr_line = ocr_lines[line_id]
            gt_len = len(gt_line)
            ocr_len = len(ocr_line)
            if 0.2 * (gt_len + ocr_len) < math.fabs(gt_len - ocr_len) > 5:
                LOG.warning('line "%s" in file "%s" deviates significantly in length (%d vs %d)',
                            str(line_id), ocr_file, gt_len, ocr_len)
            if normalization == 'Levenshtein-fast':
                # not exact (but fast): codepoints
                dist = aligners[i].get_levenshtein_distance(ocr_line, gt_line)
            else:
                # exact (but slow): grapheme clusters
                dist = aligners[i].get_adjusted_distance(
                    ocr_line, gt_line,
                    # Levenshtein / NFC / NFKC / historic_latin
                    normalization=normalization)
            edits[i].add(dist)
            lines.append({line_id: {'length': gt_len, 'distance': dist}})
        # report results
        LOG.info("%5d lines %.3f±%.3f CER %s vs %s",
                 edits[i].length, edits[i].mean, math.sqrt(edits[i].varia),
                 ocr_file, gt_file)
        report[pair]['length'] = edits[i].length
        report[pair]['distance-mean'] = edits[i].mean
        report[pair]['distance-varia'] = edits[i].varia
        if confusion:
            if not edits[i].length:
                continue
            conf = aligners[i].get_confusion(confusion)
            LOG.info("most frequent confusion / %s vs %s: %s", gt_file, ocr_file, conf)
            report[pair]['confusion'] = repr(conf)
    if output_file == '-':
        output = sys.stdout
    else:
        output = open(output_file, 'w')
    json.dump(report, output, indent=2)
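# Shape of the in-memory report built above, before json.dump stringifies the
# keys; reconstructed from the code (the pair key is "<gt_file>,<ocr_file>";
# file names and numbers are made up for illustration):
example_report = {
    "gt/lines.txt,ocr/lines.txt": {
        "lines": [
            {0: {"length": 42, "distance": 0.024}},  # per-line GT length and distance
            {1: "missing"},                          # line absent from the GT file
        ],
        "length": 1,             # number of aligned lines (edits[i].length)
        "distance-mean": 0.024,  # mean character error rate
        "distance-varia": 0.0,   # its variance
        # "confusion": "...",    # only present with --confusion
    }
}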
def setUp(self):
    initLogging()
def tearDown(self):
    initLogging()
def setUp(self):
    initLogging()
    self.resolver = Resolver()
def setUp(self):
    self.maxDiff = None
    self.resolver = Resolver()
    initLogging()
    self.runner = CliRunner()
def setUp(self):
    disableLogging()
    initLogging()
    self.resolver = Resolver()
    self.workspace = self.resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets.xml'))
def setUp(self):
    self.mets = OcrdMets(
        filename=assets.url_of('SBB0000F29300010000/data/mets.xml'))
    initLogging()
__all__ = ['tf']

import os
import warnings

from ocrd_utils import initLogging, getLogger

initLogging()
getLogger('tensorflow').setLevel('ERROR')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # No prints from the tensorflow side
warnings.filterwarnings('ignore', category=FutureWarning)

#import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow.keras as keras

tf.disable_v2_behavior()
def __init__(
        self,
        workspace,
        ocrd_tool=None,
        parameter=None,
        # TODO OCR-D/core#274
        # input_file_grp=None,
        # output_file_grp=None,
        input_file_grp="INPUT",
        output_file_grp="OUTPUT",
        page_id=None,
        show_resource=None,
        list_resources=False,
        show_help=False,
        show_version=False,
        dump_json=False,
        version=None):
    """
    Instantiate, but do not process. Unless ``list_resources`` or
    ``show_resource`` or ``show_help`` or ``show_version`` or ``dump_json``
    is true, setup for processing (parsing and validating parameters,
    entering the workspace directory).

    Args:
        workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
            Can be ``None`` even for processing (esp. on multiple workspaces), \
            but then needs to be set before running.
    Keyword Args:
        ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
            Can be ``None`` for processing, but needs to be set before running.
        parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
            Can be ``None`` even for processing, but then needs to be set before running.
        input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
        output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
        page_id (string): comma-separated list of METS physical ``page`` IDs to process \
            (or empty for all pages).
        show_resource (string): If not ``None``, then instead of processing, resolve \
            given resource by name and print its contents to stdout.
        list_resources (boolean): If true, then instead of processing, find all installed \
            resource files in the search paths and print their path names.
        show_help (boolean): If true, then instead of processing, print a usage description \
            including the standard CLI and all of this processor's ocrd-tool parameters and \
            docstrings.
        show_version (boolean): If true, then instead of processing, print information on \
            this processor's version and OCR-D version. Exit afterwards.
        dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
            on stdout.
    """
    if parameter is None:
        parameter = {}
    if dump_json:
        print(json.dumps(ocrd_tool, indent=True))
        return
    if list_resources:
        has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
        for res in list_all_resources(ocrd_tool['executable']):
            if Path(res).is_dir() and not has_dirs:
                continue
            if not Path(res).is_dir() and not has_files:
                continue
            print(res)
        return
    if show_resource:
        has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
        res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource)
        if not res_fname:
            initLogging()
            logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable'])
            logger.error("Failed to resolve %s for processor %s" % (show_resource, ocrd_tool['executable']))
        else:
            fpath = Path(res_fname[0])
            if fpath.is_dir():
                with pushd_popd(fpath):
                    fileobj = io.BytesIO()
                    with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball:
                        tarball.add('.')
                    fileobj.seek(0)
                    copyfileobj(fileobj, sys.stdout.buffer)
            else:
                sys.stdout.buffer.write(fpath.read_bytes())
        return
    self.ocrd_tool = ocrd_tool
    if show_help:
        self.show_help()
        return
    self.version = version
    if show_version:
        self.show_version()
        return
    self.workspace = workspace
    # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
    # but there is no way to do that in process here since it's an
    # overridden method. chdir is almost always an anti-pattern.
    if self.workspace:
        self.old_pwd = getcwd()
        os.chdir(self.workspace.directory)
    self.input_file_grp = input_file_grp
    self.output_file_grp = output_file_grp
    self.page_id = None if page_id == [] or page_id is None else page_id
    parameterValidator = ParameterValidator(ocrd_tool)
    report = parameterValidator.validate(parameter)
    if not report.is_valid:
        raise Exception("Invalid parameters %s" % report.errors)
    self.parameter = parameter
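# A minimal sketch of a Processor subclass built on the constructor above
# (the usual OCR-D pattern; OCRD_TOOL below is a hypothetical excerpt of an
# ocrd-tool.json, and the processing body is left as a stub):
from ocrd import Processor

OCRD_TOOL = {'tools': {'ocrd-my-processor': {  # hypothetical ocrd-tool.json excerpt
    'executable': 'ocrd-my-processor',
    'parameters': {},
}}}

class MyProcessor(Processor):
    def __init__(self, *args, **kwargs):
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-my-processor']
        super().__init__(*args, **kwargs)

    def process(self):
        # iterate self.input_files, write results to self.output_file_grp
        pass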
def validate_cli():
    """
    All the validation in one CLI
    """
    initLogging()