Ejemplo n.º 1
0
def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
    """
    Compare the PAGE/ALTO/text document GT against the document OCR.

    dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract
    their text and falls back to plain text if no ALTO or PAGE is detected.

    The files GT and OCR are usually a ground truth document and the result of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
    # Propagate the progress flag to the global configuration before running.
    Config.progress = progress
    process(gt, ocr, report_prefix,
            metrics=metrics, textequiv_level=textequiv_level)
Ejemplo n.º 2
0
def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry"""

    # arrange: a 200 response whose payload is plain text, not OAI XML
    oai_url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    mock_get.return_value.headers = {'Content-Type': 'text/plain'}
    resolver = Resolver()
    initLogging()

    # capture the log output of the OAI response handler
    capture_buffer = FIFOIO(256)
    capture_handler = StreamHandler(capture_buffer)
    capture_handler.setFormatter(Formatter(LOG_FORMAT))
    getLogger('ocrd_models.utils.handle_oai_response').addHandler(capture_handler)

    # act
    resolver.download_to_directory(response_dir, oai_url)

    # assert: exactly one request was made and a warning was logged
    mock_get.assert_called_once_with(oai_url)
    assert 'WARNING ocrd_models.utils.handle_oai_response' in capture_buffer.getvalue()
Ejemplo n.º 3
0
    def test_logging_really_non_duplicate(self):
        """A record emitted by a child logger must surface exactly once:
        at the nearest ancestor with a handler, and not at the root."""
        initLogging()
        child_logger = getLogger('a.b')
        print(child_logger)
        parent_logger = getLogger('a')
        root_logger = getLogger('')
        # per the assertion messages: root has a handler (so no propagation),
        # while 'a' and 'a.b' have none yet and therefore do propagate
        self.assertFalse(root_logger.propagate, 'root logger should not propagate')
        self.assertTrue(parent_logger.propagate, 'parent has no handler => do propagate')
        self.assertTrue(child_logger.propagate, 'child no handler => do propagate')

        # capture whatever arrives at the root logger
        root_capture = FIFOIO(256)
        root_handler = logging.StreamHandler(root_capture)
        root_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
        root_logger.addHandler(root_handler)

        # capture whatever arrives at the intermediate logger 'a'
        parent_capture = FIFOIO(256)
        parent_handler = logging.StreamHandler(parent_capture)
        parent_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
        parent_logger.addHandler(parent_handler)

        # parent_logger = getLogger('a')
        # self.assertFalse(parent_logger.propagate, 'parent has handler now => do not propagate')

        self.assertTrue(child_logger.propagate, 'child has still no handler => do propagate')

        child_logger.error('test')

        root_str = root_capture.getvalue()
        parent_str = parent_capture.getvalue()
        print('root_str=%s' % root_str)
        print('parent_str=%s' % parent_str)

        # exactly one line at the parent, nothing at root => no duplication
        self.assertEqual(root_str.count('\n'), 0)
        self.assertEqual(parent_str.count('\n'), 1)
Ejemplo n.º 4
0
    def test_tmpConfigfile(self):
        """initLogging must pick up an ocrd_logging.conf found in the CWD
        and apply its log levels instead of the built-in defaults."""
        # sanity check: some level is already configured before the test
        self.assertNotEqual(
            logging.getLogger('').getEffectiveLevel(), logging.NOTSET)
        with TemporaryDirectory() as tempdir:
            with pushd_popd(tempdir):
                with open('ocrd_logging.conf', 'w') as f:
                    # write logging configuration file (MWE)
                    f.write('''
                        [loggers]
                        keys=root

                        [handlers]
                        keys=consoleHandler

                        [formatters]
                        keys=

                        [logger_root]
                        level=ERROR
                        handlers=consoleHandler

                        [handler_consoleHandler]
                        class=StreamHandler
                        formatter=
                        args=(sys.stdout,)
                        ''')
                # this will call logging.config.fileConfig with disable_existing_loggers=True,
                # so the defaults from the import-time initLogging should be invalidated
                initLogging()
                # ensure log level is set from temporary config file
                self.assertEqual(
                    logging.getLogger('').getEffectiveLevel(), logging.ERROR)
Ejemplo n.º 5
0
def main() -> None:
    """Entry point: initialize OCR-D logging, then launch the browser app."""
    from ocrd_utils import initLogging
    initLogging()
    # NOTE(review): imported only after initLogging — presumably so import-time
    # log output of the application module is handled; confirm before reordering.
    from ocrd_browser.application import OcrdBrowserApplication
    install_excepthook()
    app = OcrdBrowserApplication()
    app.run(sys.argv)
Ejemplo n.º 6
0
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.)
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    # bail out early when the METS points at a local file that does not exist
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        raise Exception("File does not exist: %s" % mets)
    workspace = Resolver().workspace_from_url(mets, working_dir)
    output_grp = kwargs['output_file_grp']
    processor = Processor(
        workspace,
        ocrd_tool=None,
        page_id=kwargs['page_id'],
        input_file_grp=kwargs['input_file_grp'],
        output_file_grp=output_grp)
    for input_file in processor.input_files:
        # emit one bash associative-array initializer per input file
        for attr in ('url', 'ID', 'mimetype', 'pageId'):
            print("[%s]='%s'" % (attr, getattr(input_file, attr)), end=' ')
        print("[outputFileId]='%s'" % make_file_id(input_file, output_grp))
Ejemplo n.º 7
0
def process_cli(log_level, mets, page_id, tasks, overwrite):
    """
    Process a series of tasks
    """
    initLogging()
    # run all tasks, then report completion on the CLI logger
    run_tasks(mets, log_level, page_id, tasks, overwrite)
    getLogger('ocrd.cli.process').info("Finished")
Ejemplo n.º 8
0
 def test_multiple_initLogging(self):
     """A second initLogging call must emit a warning naming this file."""
     disableLogging()
     initLogging()
     self.capture_out_err()
     initLogging()
     out = '\n'.join(self.capture_out_err())
     # the repeated call should have warned and identified the caller
     assert 'initLogging was called multiple times' in out
     assert __file__ in out
Ejemplo n.º 9
0
def workspace_cli(ctx, directory, mets, mets_basename, backup):
    """
    Working with workspace
    """
    initLogging()
    # stash the workspace context on the click context for the subcommands
    ctx.obj = WorkspaceCtx(
        directory,
        mets_url=mets,
        mets_basename=mets_basename,
        automatic_backup=backup,
    )
Ejemplo n.º 10
0
 def test_setOverrideLogLevel(self):
     """setOverrideLogLevel must force the given level onto the root logger,
     onto already-created loggers, and onto loggers created afterwards."""
     initLogging()
     rootLogger = logging.getLogger('')
     somelogger = getLogger('foo.bar')
     somelogger.setLevel(getLevelName('ERROR'))
     setOverrideLogLevel('ERROR')
     self.assertEqual(rootLogger.getEffectiveLevel(), logging.ERROR)
     self.assertEqual(somelogger.getEffectiveLevel(), logging.ERROR)
     # a logger created after the override inherits the overridden level too
     notherlogger = getLogger('bar.foo')
     self.assertEqual(notherlogger.getEffectiveLevel(), logging.ERROR)
     setOverrideLogLevel('INFO')
     somelogger = getLogger('foo.bar')
Ejemplo n.º 11
0
def main(input_file, textequiv_level):
    """
    Extract the text of the given INPUT_FILE.

    dinglehopper detects if INPUT_FILE is an ALTO or PAGE XML document to extract
    its text and falls back to plain text if no ALTO or PAGE is detected.

    By default, the text of PAGE files is extracted on 'region' level. You may
    use "--textequiv-level line" to extract from the level of TextLine tags.
    """
    initLogging()
    # extract() returns an object whose .text attribute holds the document text
    print(extract(input_file, textequiv_level=textequiv_level).text)
Ejemplo n.º 12
0
    def test_logging_non_duplicate(self):
        """
        Verify that child loggers don't propagate a log message they handle
        """
        initLogging()

        root_logger = logging.getLogger('')
        self.assertTrue(root_logger.handlers, 'root logger has at least 1 handler')

        # attach a capturing handler to 'foo'; per the assertion below,
        # re-fetching the logger afterwards shows propagation disabled
        parent_capture = FIFOIO(256)
        parent_handler = logging.StreamHandler(parent_capture)
        parent_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
        parent_logger = getLogger('foo')
        self.assertTrue(parent_logger.propagate, 'no handler on logger => propagate')
        parent_logger.addHandler(parent_handler)
        parent_logger = getLogger('foo')
        self.assertFalse(parent_logger.propagate, 'should not propagate because StreamHandler has been attached')

        child_logger = getLogger('foo.bar')
        self.assertTrue(child_logger.propagate, 'no handler on logger => propagate')
        child_logger.setLevel('DEBUG')

        # emitted while the child has no handler — per the final assertions,
        # only the parent capture receives this record
        child_logger.error('first error')

        child_capture = FIFOIO(256)
        child_handler = logging.StreamHandler(child_capture)
        child_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))

        # emitted before the child handler is attached — per the final
        # assertions this ends up in neither capture
        child_logger.debug('first debug')

        child_logger.addHandler(child_handler)
        child_logger = getLogger('foo.bar')
        self.assertFalse(child_logger.propagate, 'should not propagate because StreamHandler has been attached')

        # now handled by the child only, without propagating to the parent
        child_logger.debug('second debug')
        child_logger.error('second error')

        parent_output = parent_capture.getvalue()
        parent_capture.close()
        child_output = child_capture.getvalue()
        child_capture.close()
        # print('parent', parent_output)
        # print('child', child_output)

        self.assertTrue(match(TIMEFMT_RE + 'ERROR foo.bar - first error\n', parent_output),
                'parent received first error but not second error nor first debug')
        self.assertTrue(match("\n".join([
            TIMEFMT_RE + 'DEBUG foo.bar - second debug',
            TIMEFMT_RE + 'ERROR foo.bar - second error',
            ]), child_output),
                'child received second error and debug but not first error and debug')
Ejemplo n.º 13
0
    def testProcessorProfiling(self):
        """run_processor must log wall-time profiling info on the
        'ocrd.process.profile' logger."""
        initLogging()
        # capture everything arriving at the profiling logger
        log_capture_string = FIFOIO(256)
        ch = logging.StreamHandler(log_capture_string)
        ch.setFormatter(logging.Formatter(LOG_FORMAT))
        getLogger('ocrd.process.profile').setLevel('DEBUG')
        getLogger('ocrd.process.profile').addHandler(ch)

        run_processor(DummyProcessor, resolver=Resolver(), mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))

        log_contents = log_capture_string.getvalue()
        log_capture_string.close()
        # with open('/tmp/debug.log', 'w') as f:
        #     f.write(log_contents)
        # Check whether profile information has been logged. Dummy should finish in under 0.1s
        self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', log_contents))
Ejemplo n.º 14
0
def test_handle_common_oai_response(mock_get, response_dir, oai_response_content):
    """Base use case with valid OAI Response data"""
    initLogging()

    # arrange: a 200 response carrying a valid text/xml OAI payload
    oai_url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=9049'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = oai_response_content
    mock_get.return_value.headers = {'Content-Type': 'text/xml'}

    # act
    result = Resolver().download_to_directory(response_dir, oai_url)

    # assert: one request was made and the OAI payload was recognized
    mock_get.assert_called_once_with(oai_url)
    assert result == 'oai'
Ejemplo n.º 15
0
def main(log_level, alto_version, check_words, check_border, skip_empty_lines,
         trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index,
         textequiv_fallback_strategy, region_order, output_file, filename):
    """
    Convert PAGE to ALTO
    """
    initLogging()
    converter = OcrdPageAltoConverter(
        alto_version=alto_version,
        page_filename=filename,
        check_words=check_words,
        check_border=check_border,
        skip_empty_lines=skip_empty_lines,
        trailing_dash_to_hyp=trailing_dash_to_hyp,
        dummy_textline=dummy_textline,
        dummy_word=dummy_word,
        textequiv_index=textequiv_index,
        textequiv_fallback_strategy=textequiv_fallback_strategy,
        region_order=region_order)
    converter.convert()
    # '-' means stdout (fd 1). closefd=False keeps the with-block from closing
    # the process's stdout — `with open(1, 'w')` would close fd 1 on exit,
    # breaking any later writes to stdout in the same process.
    if output_file == '-':
        output = open(1, 'w', closefd=False)
    else:
        output = open(output_file, 'w')
    with output:
        output.write(str(converter))
Ejemplo n.º 16
0
def main(gt, ocr, report_prefix, metrics):
    """
    Compare the GT line text directory against the OCR line text directory.

    This assumes that the GT line text directory contains textfiles with a common
    suffix like ".gt.txt", and the OCR line text directory contains textfiles with
    a common suffix like ".some-ocr.txt". The text files also need to be paired,
    i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
    in the OCR lines directory.

    The GT and OCR directories are usually ground truth line texts and the results of
    an OCR software, but you may use dinglehopper to compare two OCR results. In
    that case, use --no-metrics to disable the then meaningless metrics and also
    change the color scheme from green/red to blue.

    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
    $REPORT_PREFIX defaults to "report". The reports include the character error
    rate (CER) and the word error rate (WER).

    """
    initLogging()
    process(gt, ocr, report_prefix, metrics=metrics)
Ejemplo n.º 17
0
def main(image, out, model, save_images, save_layout, save_deskewed, save_all,
         enable_plotting, allow_enhancement, curved_line, full_layout, tables,
         input_binary, allow_scaling, headers_off, log_level):
    """Run Eynollah layout analysis on IMAGE and write the PAGE-XML result."""
    # honor an explicit log level before initializing logging
    if log_level:
        setOverrideLogLevel(log_level)
    initLogging()
    # the -sl/-sd/-sa/-si/-ae options only make sense together with -ep
    wants_plot_output = (save_layout or save_deskewed or save_all
                         or save_images or allow_enhancement)
    if wants_plot_output and not enable_plotting:
        print(
            "Error: You used one of -sl, -sd, -sa, -si or -ae but did not enable plotting with -ep"
        )
        sys.exit(1)
    if enable_plotting and not wants_plot_output:
        print(
            "Error: You used -ep to enable plotting but set none of -sl, -sd, -sa, -si or -ae"
        )
        sys.exit(1)
    eynollah = Eynollah(
        image_filename=image,
        dir_out=out,
        dir_models=model,
        dir_of_cropped_images=save_images,
        dir_of_layout=save_layout,
        dir_of_deskewed=save_deskewed,
        dir_of_all=save_all,
        enable_plotting=enable_plotting,
        allow_enhancement=allow_enhancement,
        curved_line=curved_line,
        full_layout=full_layout,
        tables=tables,
        input_binary=input_binary,
        allow_scaling=allow_scaling,
        headers_off=headers_off,
    )
    pcgts = eynollah.run()
    eynollah.writer.write_pagexml(pcgts)
Ejemplo n.º 18
0
Archivo: log.py Proyecto: cclauss/core
def log_cli(ctx, name, *args, **kwargs):
    """
    Logging
    """
    # group-level setup: initialize logging and stash a LogCtx for subcommands
    initLogging()
    ctx.obj = LogCtx(name)
Ejemplo n.º 19
0
def ocrd_cli_wrap_processor(
        processorClass,
        ocrd_tool=None,
        mets=None,
        working_dir=None,
        dump_json=False,
        help=False,  # pylint: disable=redefined-builtin
        version=False,
        overwrite=False,
        **kwargs):
    """
    Run ``processorClass`` as a CLI: handle the informational flags
    (``dump_json``/``help``/``version``) without a workspace, otherwise
    resolve the workspace from ``mets``/``working_dir``, validate the
    input/output file groups, and invoke ``run_processor``.
    """
    # no CLI arguments at all: show usage and exit with an error
    if not sys.argv[1:]:
        processorClass(workspace=None, show_help=True)
        sys.exit(1)
    if dump_json or help or version:
        # informational modes need no workspace and exit successfully
        processorClass(workspace=None,
                       dump_json=dump_json,
                       show_help=help,
                       show_version=version)
        sys.exit()
    else:
        initLogging()
        LOG = getLogger('ocrd_cli_wrap_processor')
        # LOG.info('kwargs=%s' % kwargs)
        # Merge parameter overrides and parameters
        if 'parameter_override' in kwargs:
            set_json_key_value_overrides(kwargs['parameter'],
                                         *kwargs['parameter_override'])
        # TODO OCR-D/core#274
        # Assert -I / -O
        # if not kwargs['input_file_grp']:
        #     raise ValueError('-I/--input-file-grp is required')
        # if not kwargs['output_file_grp']:
        #     raise ValueError('-O/--output-file-grp is required')
        # fail early when the METS points at a local file that does not exist
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        page_id = kwargs.get('page_id')
        # XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505
        # if overwrite
        #     if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
        #         raise Exception("--overwrite requires --output-file-grp")
        #     LOG.info("Removing files because of --overwrite")
        #     for grp in kwargs['output_file_grp'].split(','):
        #         if page_id:
        #             for one_page_id in kwargs['page_id'].split(','):
        #                 LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
        #                 for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
        #                     workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
        #         else:
        #             LOG.debug("Removing all files in output file group %s ", grp)
        #             # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
        #             workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
        #     workspace.save_mets()
        # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
        if overwrite:
            workspace.overwrite_mode = True
        # with --overwrite, skip validating the output group (it may exist)
        report = WorkspaceValidator.check_file_grp(
            workspace, kwargs['input_file_grp'],
            '' if overwrite else kwargs['output_file_grp'], page_id)
        if not report.is_valid:
            raise Exception("Invalid input/output file grps:\n\t%s" %
                            '\n\t'.join(report.errors))
        run_processor(processorClass,
                      ocrd_tool,
                      mets,
                      workspace=workspace,
                      **kwargs)
Ejemplo n.º 20
0
Archivo: zip.py Proyecto: cclauss/core
def zip_cli():
    """
    Bag/Spill/Validate OCRD-ZIP bags
    """
    # group-level setup only: initialize logging for the subcommands
    initLogging()
Ejemplo n.º 21
0
def cli(output_file, normalization, gt_level, confusion, file_lists, gt_file,
        ocr_files):
    """Compare text lines by aligning and computing the textual distance and character error rate.

    This compares 1:n given PAGE-XML or plain text files.

    If `--file-lists` is given and files are plain text,
    then they will be interpreted as (newline-separated)
    lists of path names for single-line text files (for
    Ocropus convention).

    Writes a JSON report file to `--output-file`.
    (No error aggregation across files in this CLI.)
    """
    initLogging()
    LOG = logging.getLogger(__name__)
    LOG.setLevel(logging.INFO)

    # one aligner and one edit-statistics accumulator per OCR file
    aligners = [
        Alignment(logger=LOG, confusion=bool(confusion)) for _ in ocr_files
    ]
    edits = [Edits(logger=LOG) for _ in ocr_files]
    LOG.info("processing '%s'", gt_file)
    gt_lines = get_lines(gt_file, file_lists)
    if not gt_lines:
        LOG.critical("file '%s' contains no text lines to compare", gt_file)
        # was bare exit(1): sys.exit is the canonical form outside the REPL
        sys.exit(1)
    report = dict()
    for i, ocr_file in enumerate(ocr_files):
        LOG.info("processing '%s'", ocr_file)
        ocr_lines = get_lines(ocr_file, file_lists)
        if not ocr_lines:
            LOG.error("file '%s' contains no text lines to compare", ocr_file)
            continue
        pair = gt_file + ',' + ocr_file
        if isinstance(ocr_lines, dict):
            # from PAGE-XML file
            line_ids = ocr_lines.keys()
        else:
            # from plain text file
            line_ids = range(len(ocr_lines))
        for line_id in line_ids:
            lines = report.setdefault(pair, dict()).setdefault('lines', list())
            # GT may be keyed by line ID (PAGE-XML) or indexed (plain text)
            if isinstance(gt_lines, dict):
                has_line = line_id in gt_lines
            else:
                has_line = line_id < len(gt_lines)
            if not has_line:
                LOG.error(
                    "line '%s' in file '%s' is missing from GT file '%s'",
                    str(line_id), ocr_file, gt_file)
                lines.append({line_id: 'missing'})
                continue
            gt_line = gt_lines[line_id]
            ocr_line = ocr_lines[line_id]
            gt_len = len(gt_line)
            ocr_len = len(ocr_line)
            # warn when lengths differ by more than 5 chars AND more than
            # 20% of the combined length (chained comparison: both must hold)
            if 0.2 * (gt_len + ocr_len) < math.fabs(gt_len - ocr_len) > 5:
                LOG.warning(
                    'line "%s" in file "%s" deviates significantly in length (%d vs %d)',
                    str(line_id), ocr_file, gt_len, ocr_len)
            if normalization == 'Levenshtein-fast':
                # not exact (but fast): codepoints
                dist = aligners[i].get_levenshtein_distance(ocr_line, gt_line)
            else:
                # exact (but slow): grapheme clusters
                dist = aligners[i].get_adjusted_distance(
                    ocr_line,
                    gt_line,
                    # Levenshtein / NFC / NFKC / historic_latin
                    normalization=normalization)
            edits[i].add(dist)
            lines.append({line_id: {'length': gt_len, 'distance': dist}})
        # report results
        LOG.info("%5d lines %.3f±%.3f CER %s vs %s", edits[i].length,
                 edits[i].mean, math.sqrt(edits[i].varia), ocr_file, gt_file)
        report[pair]['length'] = edits[i].length
        report[pair]['distance-mean'] = edits[i].mean
        report[pair]['distance-varia'] = edits[i].varia
        if confusion:
            if not edits[i].length:
                continue
            conf = aligners[i].get_confusion(confusion)
            LOG.info("most frequent confusion / %s vs %s: %s", gt_file,
                     ocr_file, conf)
            report[pair]['confusion'] = repr(conf)
    # write the report; the previous version leaked the file handle for
    # non-stdout output — close it deterministically via the with-block
    if output_file == '-':
        json.dump(report, sys.stdout, indent=2)
    else:
        with open(output_file, 'w') as output:
            json.dump(report, output, indent=2)
Ejemplo n.º 22
0
 def setUp(self):
     # fresh logging configuration before every test
     initLogging()
Ejemplo n.º 23
0
 def tearDown(self):
     # re-initialize logging so subsequent tests see a clean configuration
     initLogging()
Ejemplo n.º 24
0
 def setUp(self):
     # fresh logging configuration and a new resolver for every test
     initLogging()
     self.resolver = Resolver()
Ejemplo n.º 25
0
 def setUp(self):
     # show full diffs on assertion failures
     self.maxDiff = None
     self.resolver = Resolver()
     initLogging()
     # CLI test runner (presumably click's CliRunner — confirm import)
     self.runner = CliRunner()
Ejemplo n.º 26
0
 def setUp(self):
     # fully reset logging (disable, then re-init) for test isolation
     disableLogging()
     initLogging()
     self.resolver = Resolver()
     # workspace fixture resolved from the test assets
     self.workspace = self.resolver.workspace_from_url(
         assets.url_of('SBB0000F29300010000/data/mets.xml'))
Ejemplo n.º 27
0
 def setUp(self):
     # METS fixture loaded from the test assets
     self.mets = OcrdMets(
         filename=assets.url_of('SBB0000F29300010000/data/mets.xml'))
     initLogging()
Ejemplo n.º 28
0
# Shim module: configure logging/warnings BEFORE importing TensorFlow,
# then re-export the TF1-compat module as `tf`.
__all__ = ['tf']

import os
import warnings
from ocrd_utils import initLogging, getLogger
# silence TensorFlow's Python-side logger before the import below
initLogging()
getLogger('tensorflow').setLevel('ERROR')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # No prints from the tensorflow side
warnings.filterwarnings('ignore', category=FutureWarning)
#import tensorflow as tf
import tensorflow.compat.v1 as tf
import tensorflow.keras as keras
tf.disable_v2_behavior()  # run with TF1.x semantics
Ejemplo n.º 29
0
    def __init__(
            self,
            workspace,
            ocrd_tool=None,
            parameter=None,
            # TODO OCR-D/core#274
            # input_file_grp=None,
            # output_file_grp=None,
            input_file_grp="INPUT",
            output_file_grp="OUTPUT",
            page_id=None,
            show_resource=None,
            list_resources=False,
            show_help=False,
            show_version=False,
            dump_json=False,
            version=None):
        """
        Instantiate, but do not process. Unless ``list_resources`` or
        ``show_resource`` or ``show_help`` or ``show_version`` or
        ``dump_json`` is true, setup for processing (parsing and
        validating parameters, entering the workspace directory).

        Args:
             workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \
                 Can be ``None`` even for processing (esp. on multiple workspaces), \
                 but then needs to be set before running.
        Keyword Args:
             ocrd_tool (string): JSON of the ocrd-tool description for that processor. \
                 Can be ``None`` for processing, but needs to be set before running.
             parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \
                 Can be ``None`` even for processing, but then needs to be set before running.
             input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input.
             output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output.
             page_id (string): comma-separated list of METS physical ``page`` IDs to process \
                 (or empty for all pages).
             show_resource (string): If not ``None``, then instead of processing, resolve \
                 given resource by name and print its contents to stdout.
             list_resources (boolean): If true, then instead of processing, find all installed \
                 resource files in the search paths and print their path names.
             show_help (boolean): If true, then instead of processing, print a usage description \
                 including the standard CLI and all of this processor's ocrd-tool parameters and \
                 docstrings.
             show_version (boolean): If true, then instead of processing, print information on \
                 this processor's version and OCR-D version. Exit afterwards.
             dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \
                 on stdout.
        """
        if parameter is None:
            parameter = {}
        # informational modes below each return early without processing setup
        if dump_json:
            print(json.dumps(ocrd_tool, indent=True))
            return
        if list_resources:
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
            for res in list_all_resources(ocrd_tool['executable']):
                # skip resource kinds this processor does not accept
                if Path(res).is_dir() and not has_dirs:
                    continue
                if not Path(res).is_dir() and not has_files:
                    continue
                print(res)
            return
        if show_resource:
            has_dirs, has_files = get_processor_resource_types(None, ocrd_tool)
            res_fname = list_resource_candidates(ocrd_tool['executable'],
                                                 show_resource)
            if not res_fname:
                initLogging()
                logger = getLogger('ocrd.%s.__init__' %
                                   ocrd_tool['executable'])
                logger.error("Failed to resolve %s for processor %s" %
                             (show_resource, ocrd_tool['executable']))
            else:
                fpath = Path(res_fname[0])
                if fpath.is_dir():
                    # a directory resource is streamed to stdout as .tar.gz
                    with pushd_popd(fpath):
                        fileobj = io.BytesIO()
                        with tarfile.open(fileobj=fileobj,
                                          mode='w:gz') as tarball:
                            tarball.add('.')
                        fileobj.seek(0)
                        copyfileobj(fileobj, sys.stdout.buffer)
                else:
                    sys.stdout.buffer.write(fpath.read_bytes())
            return
        self.ocrd_tool = ocrd_tool
        if show_help:
            self.show_help()
            return
        self.version = version
        if show_version:
            self.show_version()
            return
        self.workspace = workspace
        # FIXME HACK would be better to use pushd_popd(self.workspace.directory)
        # but there is no way to do that in process here since it's an
        # overridden method. chdir is almost always an anti-pattern.
        if self.workspace:
            self.old_pwd = getcwd()
            os.chdir(self.workspace.directory)
        self.input_file_grp = input_file_grp
        self.output_file_grp = output_file_grp
        # normalize an empty page_id list to None (= all pages)
        self.page_id = None if page_id == [] or page_id is None else page_id
        parameterValidator = ParameterValidator(ocrd_tool)
        report = parameterValidator.validate(parameter)
        if not report.is_valid:
            raise Exception("Invalid parameters %s" % report.errors)
        self.parameter = parameter
Ejemplo n.º 30
0
def validate_cli():
    """
    All the validation in one CLI
    """
    # group-level setup only: initialize logging for the subcommands
    initLogging()