コード例 #1
0
    def testProcessorProfiling(self):
        """Check that run_processor emits profiling info on 'ocrd.process.profile'."""
        initLogging()
        log_capture_string = FIFOIO(256)
        ch = logging.StreamHandler(log_capture_string)
        ch.setFormatter(logging.Formatter(LOG_FORMAT))
        getLogger('ocrd.process.profile').setLevel('DEBUG')
        getLogger('ocrd.process.profile').addHandler(ch)

        run_processor(
            DummyProcessor,
            resolver=Resolver(),
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))

        log_contents = log_capture_string.getvalue()
        log_capture_string.close()
        # Check whether profile information has been logged.
        # Dummy processing should finish in well under a second.
        # fix: escape the decimal point — the unescaped '.' in `0.\d+`
        # matched any character, not just a literal dot
        self.assertTrue(
            match(r'.*Executing processor \'ocrd-test\' took 0\.\d+s.*',
                  log_contents))
コード例 #2
0
ファイル: decode.py プロジェクト: ASVLeipzig/cor-asv-fst
def _page_get_lines(pcgts):
    LOG = getLogger('processor.FSTCorrection')
    lines = []
    n_regions = pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')
    if not n_regions:
        LOG.warning('Page contains no text regions')
    for n_region in n_regions:
        n_lines = n_region.get_TextLine()
        if not n_lines:
            LOG.warning("Region '%s' contains no text lines", n_region.id)
        lines.extend(n_lines)
    return lines
コード例 #3
0
 def _validate_page_xsd(self):
     """
     Validate all PAGE-XML files against PAGE XSD schema
     """
     log = getLogger('ocrd.workspace_validator')
     log.debug("Validating all PAGE-XML files against XSD")
     for ocrd_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
         # make sure the file exists locally before validating it
         self.workspace.download_file(ocrd_file)
         for err in XsdPageValidator.validate(Path(
                 ocrd_file.local_filename)).errors:
             self.report.add_error("%s: %s" % (ocrd_file.ID, err))
     # fix: log-message typo "alidating" -> "validating"
     log.debug("Finished validating all PAGE-XML files against XSD")
コード例 #4
0
def _fix_segment(segment, page_id, reverse=False):
    """Fix order of child elements of (region/line/word) segment.

    If the concatenated text of the children disagrees with the text of the
    segment itself, try re-sorting the children spatially (top-down for
    regions, left-right for lines/words; inverted when ``reverse``).  The
    new order is only adopted if it makes the two texts consistent.

    Raises:
        Exception: if ``segment`` is not a TextRegion/TextLine/Word.
    """
    LOG = getLogger('processor.RepairInconsistencies')

    if isinstance(segment, TextRegionType):
        joiner = '\n'
        sort_horizontal = False
        children = segment.get_TextLine()
        adoption = segment.set_TextLine
    elif isinstance(segment, TextLineType):
        joiner = ' '
        sort_horizontal = True
        children = segment.get_Word()
        adoption = segment.set_Word
    elif isinstance(segment, WordType):
        joiner = ''
        sort_horizontal = True
        children = segment.get_Glyph()
        adoption = segment.set_Glyph
    else:
        raise Exception('invalid element type %s of segment to fix' %
                        type(segment))
    if not children:
        return
    segment_text = get_text(segment)
    concat_text = get_text(children, joiner)
    # only act when both texts exist and differ even after joiner removal
    if (segment_text and concat_text and segment_text != concat_text
            and segment_text.replace(joiner, '') != concat_text.replace(
                joiner, '')):

        def polygon_position(child, horizontal=sort_horizontal):
            # centroid of the child polygon serves as a robust 1-d sort key
            polygon = Polygon(polygon_from_points(child.get_Coords().points))
            if horizontal:
                return polygon.centroid.x
            return polygon.centroid.y

        sorted_children = sorted(children,
                                 reverse=reverse,
                                 key=polygon_position)
        sorted_concat_text = get_text(sorted_children, joiner)

        if (segment_text == sorted_concat_text or segment_text.replace(
                joiner, '') == sorted_concat_text.replace(joiner, '')):
            LOG.info('Fixing element order of page "%s" segment "%s"', page_id,
                     segment.id)
            adoption(sorted_children)
        else:
            # fix: the two adjacent string literals lacked a separating
            # space, producing "... to %sdoes not suffice ..."
            LOG.debug(
                'Resorting children of page "%s" segment "%s" from %s to %s '
                'does not suffice to turn "%s" into "%s"', page_id, segment.id,
                str([seg.id for seg in children]),
                str([seg.id for seg in sorted_children]), concat_text,
                segment_text)
コード例 #5
0
 def _copy_impl(self, src_filename, filename, progress_cb=None):
     log = getLogger('ocrd.resource_manager._copy_impl')
     log.info("Copying %s" % src_filename)
     with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
         while True:
             chunk = f_in.read(4096)
             if chunk:
                 f_out.write(chunk)
                 if progress_cb:
                     progress_cb(len(chunk))
             else:
                 break
コード例 #6
0
    def test_logging_really_non_duplicate(self):
        """A child logger's record must reach exactly one capturing handler."""
        initLogging()
        child_logger = getLogger('a.b')
        print(child_logger)
        parent_logger = getLogger('a')
        root_logger = getLogger('')
        self.assertFalse(root_logger.propagate,
                         'root logger should not propagate')
        self.assertTrue(parent_logger.propagate,
                        'parent has no handler => do propagate')
        self.assertTrue(child_logger.propagate,
                        'child no handler => do propagate')

        def attach_capture(logger):
            # buffer + stream handler using the standard format
            capture = FIFOIO(256)
            handler = logging.StreamHandler(capture)
            handler.setFormatter(
                logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
            logger.addHandler(handler)
            return capture

        root_capture = attach_capture(root_logger)
        parent_capture = attach_capture(parent_logger)

        # parent_logger = getLogger('a')
        # self.assertFalse(parent_logger.propagate, 'parent has handler now => do not propagate')

        self.assertTrue(child_logger.propagate,
                        'child has still no handler => do propagate')

        child_logger.error('test')

        root_str = root_capture.getvalue()
        parent_str = parent_capture.getvalue()
        print('root_str=%s' % root_str)
        print('parent_str=%s' % parent_str)

        self.assertEqual(root_str.count('\n'), 0)
        self.assertEqual(parent_str.count('\n'), 1)
コード例 #7
0
ファイル: resource_manager.py プロジェクト: bertsky/core
 def download(
     self,
     executable,
     url,
     basedir,
     overwrite=False,
     no_subdir=False,
     name=None,
     resource_type='file',
     path_in_archive='.',
     progress_cb=None,
     size=None,
 ):
     """
     Download a resource by URL

     Args:
         executable: processor the resource belongs to (used as subdirectory)
         url: http(s) URL or local path of the resource
         basedir: base directory to store resources under
         overwrite: replace an already existing resource
         no_subdir: store directly in ``basedir`` instead of ``basedir/executable``
         name: file name of the resource (default: derived from ``url``)
         resource_type: ``'file'`` or ``'tarball'``
         path_in_archive: for tarballs, the member to copy out of the archive
         progress_cb: optional progress callback, passed chunk sizes
         size: expected download size, passed through to the downloader

     Returns:
         Path of the downloaded/copied resource
     """
     log = getLogger('ocrd.resource_manager.download')
     destdir = Path(basedir) if no_subdir else Path(basedir, executable)
     if not name:
         url_parsed = urlparse(url)
         name = Path(unquote(url_parsed.path)).name
     fpath = Path(destdir, name)
     # idiom: str.startswith accepts a tuple of prefixes
     is_url = url.startswith(('https://', 'http://'))
     if fpath.exists() and not overwrite:
         # lazy %-args instead of eager string interpolation
         log.info("%s to be %s to %s which already exists and overwrite is False",
                  url, 'downloaded' if is_url else 'copied', fpath)
         return fpath
     destdir.mkdir(parents=True, exist_ok=True)
     if resource_type == 'file':
         if is_url:
             self._download_impl(url, fpath, progress_cb)
         else:
             self._copy_impl(url, fpath, progress_cb)
     elif resource_type == 'tarball':
         # fetch into a temp dir, extract, then copy the wanted member
         with pushd_popd(tempdir=True):
             if is_url:
                 self._download_impl(url, 'download.tar.xx', progress_cb,
                                     size)
             else:
                 self._copy_impl(url, 'download.tar.xx', progress_cb)
             Path('out').mkdir()
             with pushd_popd('out'):
                 log.info("Extracting tarball")
                 with open_tarfile('../download.tar.xx', 'r:*') as tar:
                     tar.extractall()
                 log.info("Copying '%s' from tarball to %s",
                          path_in_archive, fpath)
                 copytree(path_in_archive, str(fpath))
     # TODO
     # elif resource_type == 'github-dir':
     return fpath
コード例 #8
0
    def spill(self, src, dest):
        """
        Spill a workspace, i.e. unpack it and turn it into a workspace.

        See https://ocr-d.github.com/ocrd_zip#unpacking-ocrd-zip-to-a-workspace

        Arguments:
            src (string): Path to OCRD-ZIP
            dest (string): Path to directory to unpack data folder to
        """
        log = getLogger('ocrd.workspace_bagger')

        if exists(dest) and not isdir(dest):
            raise Exception("Not a directory: %s" % dest)

        # If dest is an existing directory, try to derive its name from src
        if isdir(dest):
            workspace_name = re.sub(r'(\.ocrd)?\.zip$', '', basename(src))
            new_dest = join(dest, workspace_name)
            if exists(new_dest):
                raise Exception("Directory exists: %s" % new_dest)
            dest = new_dest

        log.info("Spilling %s to %s", src, dest)

        bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX)
        unzip_file_to_dir(src, bagdir)

        # mirror the bag's data/ tree into dest, file by file
        datadir = join(bagdir, 'data')
        for root, _, filenames in walk(datadir):
            targetdir = join(dest, relpath(root, datadir))
            for filename in filenames:
                source = join(root, filename)
                target = join(targetdir, filename)
                if not exists(targetdir):
                    makedirs(targetdir)
                log.debug("Copy %s -> %s", source, target)
                copyfile(source, target)

        # TODO copy allowed tag files if present

        # TODO validate bagit

        # Drop tempdir
        rmtree(bagdir)

        # Create workspace
        workspace = Workspace(self.resolver, directory=dest)

        # TODO validate workspace

        return workspace
コード例 #9
0
    def process(self):
        """Segment pages into regions using a Mask R-CNN model."""
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('processor.AnybaseocrBlockSegmenter')
        if not tf.test.is_gpu_available():
            LOG.warning(
                "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
            )

        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_id = input_file.pageId or input_file.ID

            # todo rs: why not cropped?
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized,deskewed,cropped,clipped,non_text')
            # try to load pixel masks (best-effort: missing mask is OK)
            try:
                # todo rs: this combination only works for tiseg with use_deeplr=true
                mask_image, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='clipped',
                    feature_filter='binarized,deskewed,cropped,non_text')
            except Exception:
                # fix: narrowed the bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit
                mask_image = None
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert dots-per-cm to dots-per-inch
                    dpi = round(dpi * 2.54)
            else:
                dpi = None

            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, mask_image, dpi)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
コード例 #10
0
    def process(self):
        """Compare GT and OCR input groups with dinglehopper and add the
        HTML/JSON reports to the workspace."""
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
        assert_file_grp_cardinality(self.output_file_grp, 1)

        log = getLogger("processor.OcrdDinglehopperEvaluate")

        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
        # also validates that there are exactly two comma-separated groups
        gt_grp, ocr_grp = self.input_file_grp.split(",")

        # fix: create the output directory once up front instead of
        # try/mkdir/except-FileExistsError on every iteration
        os.makedirs(self.output_file_grp, exist_ok=True)

        input_file_tuples = self.zip_input_files(on_error='abort')
        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
            if not gt_file or not ocr_file:
                # file/page was not found in this group
                continue
            gt_file = self.workspace.download_file(gt_file)
            ocr_file = self.workspace.download_file(ocr_file)
            page_id = gt_file.pageId

            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

            file_id = make_file_id(ocr_file, self.output_file_grp)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files
            cli_process(
                gt_file.local_filename,
                ocr_file.local_filename,
                report_prefix,
                metrics=metrics,
                textequiv_level=textequiv_level,
            )

            # Add reports to the workspace
            for report_suffix, mimetype in [
                [".html", "text/html"],
                [".json", "application/json"],
            ]:
                self.workspace.add_file(
                    ID=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    pageId=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix,
                )

            # Clear cache between files
            levenshtein_matrix_cache_clear()
コード例 #11
0
ファイル: task_sequence.py プロジェクト: wrznr/core
def run_tasks(mets, log_level, page_id, task_strs):
    """Parse, validate and sequentially execute a list of processor tasks.

    Args:
        mets: URL/path of the METS file of the workspace to process
        log_level: log level to pass to each processor CLI
        page_id: comma-separated page IDs to restrict processing to
        task_strs (list of str): task descriptions to parse and run

    Raises:
        Exception: if a task's input groups are missing from the METS, an
            output group already exists, a processor exits non-zero, or an
            expected output group was not produced.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    log = getLogger('ocrd.task_sequence')
    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]

    for task in tasks:

        task.validate()

        # check input file groups are in mets
        for input_file_grp in task.input_file_grps:
            # idiom: `x not in y` instead of `not x in y`
            if input_file_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Unmet requirement: expected input file group not contained in mets: %s"
                    % input_file_grp)

        for output_file_grp in task.output_file_grps:
            if output_file_grp in workspace.mets.file_groups:
                raise Exception(
                    "Conflict: output file group already contained in mets: %s"
                    % output_file_grp)

        log.info("Start processing task '%s'", task)

        # execute cli
        returncode = run_cli(task.executable,
                             mets,
                             resolver,
                             workspace,
                             log_level=log_level,
                             page_id=page_id,
                             input_file_grp=','.join(task.input_file_grps),
                             output_file_grp=','.join(task.output_file_grps),
                             parameter=task.parameter_path)

        # check return code
        if returncode != 0:
            raise Exception("%s exited with non-zero return value %s" %
                            (task.executable, returncode))

        log.info("Finished processing task '%s'", task)

        # reload mets
        workspace.reload_mets()

        # check output file groups are in mets
        for output_file_grp in task.output_file_grps:
            if output_file_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Invalid state: expected output file group not in mets: %s"
                    % output_file_grp)
コード例 #12
0
ファイル: decode.py プロジェクト: ASVLeipzig/cor-asv-fst
 def _line_to_tokens(self, n_line):
     LOG = getLogger('processor.FSTCorrection')
     result = []
     n_words = n_line.get_Word()
     if not n_words:
         LOG.warning("Line '%s' contains no word", n_line.id)
     for n_word in n_words:
         n_textequivs = n_word.get_TextEquiv()
         if n_textequivs and n_textequivs[0].Unicode:
             result.append(n_textequivs[0].Unicode)
         else:
             LOG.warning("Word '%s' contains no text results", n_word.id)
     return result
コード例 #13
0
def run_cli(
    executable,
    mets_url=None,
    resolver=None,
    workspace=None,
    page_id=None,
    overwrite=None,
    log_level=None,
    input_file_grp=None,
    output_file_grp=None,
    parameter=None,
    working_dir=None,
):
    """
    Open a workspace and run a processor on the command line.

    If :py:attr:`workspace` is not none, reuse that. Otherwise, instantiate an
    :py:class:`~ocrd.Workspace` for :py:attr:`mets_url` (and :py:attr:`working_dir`)
    by using :py:meth:`ocrd.Resolver.workspace_from_url` (i.e. open or clone local workspace).

    Runs the processor CLI :py:attr:`executable` on that workspace, passing
    :py:attr:`page_id`, :py:attr:`input_file_grp`, :py:attr:`output_file_grp`
    and :py:attr:`parameter` through as command line options where set.

    (Will create output files and update the METS in the filesystem.)

    Args:
        executable (string): Executable name of the module processor.

    Returns:
        the subprocess return code
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory,
            '--mets', mets_url]
    # append optional value-carrying flags only when a value was given
    for flag, value in (
            ('--log-level', log_level),
            ('--page-id', page_id),
            ('--input-file-grp', input_file_grp),
            ('--output-file-grp', output_file_grp),
            ('--parameter', parameter)):
        if value:
            args.extend([flag, value])
    if overwrite:
        args.append('--overwrite')
    log = getLogger('ocrd.processor.helpers.run_cli')
    log.debug("Running subprocess '%s'", ' '.join(args))
    return run(args, check=False).returncode
コード例 #14
0
    def check_file_grp(workspace,
                       input_file_grp=None,
                       output_file_grp=None,
                       page_id=None,
                       report=None):
        """
        Return a report on whether input_file_grp is/are in workspace.mets and output_file_grp is/are not.
        To be run before processing

        Arguments:
            workspace (Workspace) the workspace to validate
            input_file_grp (list|string)  list or comma-separated list of input file groups
            output_file_grp (list|string) list or comma-separated list of output file groups
            page_id (list|string) list or comma-separated list of page_ids to write to
            report (ValidationReport) existing report to add errors to (default: new report)

        Returns:
            report (ValidationReport)
        """
        if not report:
            report = ValidationReport()
        # normalize comma-separated string arguments to lists
        if isinstance(input_file_grp, str):
            input_file_grp = input_file_grp.split(
                ',') if input_file_grp else []
        if isinstance(output_file_grp, str):
            output_file_grp = output_file_grp.split(
                ',') if output_file_grp else []
        if page_id and isinstance(page_id, str):
            page_id = page_id.split(',')

        log = getLogger('ocrd.workspace_validator')
        # fix: lazy %-args instead of eager string interpolation
        log.debug("input_file_grp=%s output_file_grp=%s",
                  input_file_grp, output_file_grp)
        if input_file_grp:
            for grp in input_file_grp:
                if grp not in workspace.mets.file_groups:
                    report.add_error("Input fileGrp[@USE='%s'] not in METS!" %
                                     grp)
        if output_file_grp:
            for grp in output_file_grp:
                if grp in workspace.mets.file_groups:
                    if page_id:
                        # an existing output group is OK as long as it has no
                        # output for the requested pages yet
                        for one_page_id in page_id:
                            if next(
                                    workspace.mets.find_files(
                                        fileGrp=grp, pageId=one_page_id),
                                    None):
                                report.add_error(
                                    "Output fileGrp[@USE='%s'] already contains output for page %s"
                                    % (grp, one_page_id))
                    else:
                        report.add_error(
                            "Output fileGrp[@USE='%s'] already in METS!" % grp)
        return report
コード例 #15
0
ファイル: document.py プロジェクト: bertsky/browse-ocrd
    def _page_for_id_lazy(self,
                          page_id: str,
                          file_group: Optional[str] = None) -> Optional['Page']:
        """Return a LazyPage for ``page_id`` in ``file_group``, or None if
        neither PAGE-XML nor an image is available."""
        log = getLogger('ocrd_browser.model.document.Document.page_for_id')
        if not page_id:
            return None
        page = LazyPage(self, page_id, file_group)
        if not page.file:
            # fix: lazy %-args instead of str.format, so the message is
            # only rendered when the warning is actually emitted
            log.warning(
                "No PAGE-XML and no image for page '%s' in fileGrp '%s'",
                page_id, file_group)
            return None

        return page
コード例 #16
0
 def launch(self, toolname: str, doc: Document,
            file: OcrdFile) -> Optional[Popen]:  # type: ignore[type-arg]
     """Launch the configured tool by name; log an error and return None
     if the tool is not configured."""
     # guard clause: handle the unknown-tool case first
     if toolname not in self.tools:
         log = getLogger('ocrd_browser.util.launcher.Launcher.launch')
         log.error(
             'Tool "%s" not found in your config, to fix place the following section in your ocrd-browser.conf',
             toolname)
         log.error('[Tool %s]', toolname)
         log.error(
             'commandline = /usr/bin/yourtool --base-dir {workspace.directory} {file.path.absolute}'
         )
         return None
     return self.launch_tool(self.tools[toolname], doc, file)
コード例 #17
0
def _repair_tokenisation(tokenisation, concatenation, next_token):
    LOG = getLogger('processor.KerasRate')
    # invariant: text should contain a representation that concatenates into actual tokenisation
    # ideally, both overlap (concatenation~tokenisation)
    i = 0
    for i in range(min(len(tokenisation), len(concatenation)), -1, -1):
        if concatenation[-i:] == tokenisation[:i]:
            break
    if i > 0 and tokenisation[i:].startswith(
            next_token):  # without white space?
        LOG.warning('Repairing tokenisation between "%s" and "%s"',
                    concatenation[-i:], next_token)
        return True  # repair by skipping space/newline here
    return False
コード例 #18
0
 def create_logmap_smlink(self, workspace):
     """Ensure the METS has a LOGICAL structMap and a structLink section.

     Stores the (possibly newly created) elements on ``self.log_map`` and
     ``self.link`` for later use.
     """
     # NOTE(review): the `workspace` argument is unused here —
     # self.workspace is accessed instead; kept for interface stability
     log = getLogger('OcrdAnybaseocrLayoutAnalyser')
     mets_root = self.workspace.mets._tree.getroot()
     structmap = mets_root.find('mets:structMap[@TYPE="LOGICAL"]', NS)
     if structmap is None:
         structmap = ET.SubElement(mets_root, TAG_METS_STRUCTMAP)
         structmap.set('TYPE', 'LOGICAL')
     else:
         log.info('LOGICAL structMap already exists, adding to it')
     structlink = mets_root.find('mets:structLink', NS)
     if structlink is None:
         structlink = ET.SubElement(mets_root, TAG_METS_STRUCTLINK)
     self.link = structlink
     self.log_map = structmap
コード例 #19
0
def handle_oai_response(response):
    """
    In case of a valid OAI-Response, extract first METS-Entry-Data
    """
    log = getLogger('ocrd_models.utils.handle_oai_response')
    content_type = response.headers['Content-Type']
    if any(token in content_type for token in ('xml', 'text')):
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            log.warning("textual response but no xml: %s (%s)", content, exc)
    # fall back to the raw payload
    return response.content
コード例 #20
0
    def validate(filename=None,
                 ocrd_page=None,
                 ocrd_file=None,
                 page_textequiv_consistency='strict',
                 page_textequiv_strategy='first',
                 check_baseline=True,
                 check_coords=True):
        """
        Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
            page_textequiv_strategy (string): Currently only 'first'
            check_baseline (bool): whether Baseline must be fully within TextLine/Coords
            check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                                 contained within Border/*Region/TextLine/Word, resp.

        Returns:
            report (:class:`ValidationReport`) Report on the validity
        """
        log = getLogger('ocrd.page_validator.validate')
        if ocrd_page:
            page = ocrd_page
            file_id = ocrd_page.get_pcGtsId()
        elif ocrd_file:
            page = page_from_file(ocrd_file)
            file_id = ocrd_file.ID
        elif filename:
            page = parse(filename, silence=True)
            file_id = filename
        else:
            raise Exception(
                "At least one of ocrd_page, ocrd_file or filename must be set")
        # fix: ('first') is just the string 'first', so the original
        # `not in ('first')` was a substring test which also accepted
        # e.g. 'f' or 'irs'; use a real one-element tuple
        if page_textequiv_strategy not in ('first', ):
            raise Exception("page_textequiv_strategy %s not implemented" %
                            page_textequiv_strategy)
        if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
            raise Exception(
                "page_textequiv_consistency level %s not implemented" %
                page_textequiv_consistency)
        report = ValidationReport()
        log.info("Validating input file '%s'", file_id)
        validate_consistency(page, page_textequiv_consistency,
                             page_textequiv_strategy, check_baseline,
                             check_coords, report, file_id)
        return report
コード例 #21
0
    def process(self):
        """Classify the layout type of each page image and record the result
        in the METS LOGICAL structMap."""
        LOG = getLogger('OcrdAnybaseocrLayoutAnalyser')
        if not tf.test.is_gpu_available():
            LOG.error("Your system has no CUDA installed. No GPU detected.")
            # sys.exit(1)
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        model_path = Path(self.parameter['model_path'])
        class_mapper_path = Path(self.parameter['class_mapping_path'])
        if not Path(model_path).is_file():
            LOG.error("""\
                Layout Classfication model was not found at '%s'. Make sure the `model_path` parameter
                points to the local model path.
                model can be downloaded from http://url
                """ % model_path)
            sys.exit(1)
        else:

            LOG.info('Loading model from file %s', model_path)
            model = self.create_model(str(model_path))
            # load the mapping
            # fix: use a context manager so the pickle file is closed
            # deterministically (it was opened and never closed)
            with open(str(class_mapper_path), "rb") as pickle_in:
                class_indices = pickle.load(pickle_in)
            # invert class->index into index->class for looking up predictions
            label_mapping = dict((v, k) for k, v in class_indices.items())

            # print("INPUT FILE HERE",self.input_files)
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            page_id = input_file.pageId or input_file.ID
            size = 600, 500

            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')

            # NOTE(review): Image.ANTIALIAS is removed in Pillow >= 10;
            # if Pillow is upgraded this must become Image.LANCZOS
            img_array = ocrolib.pil2array(
                page_image.resize((500, 600), Image.ANTIALIAS))
            img_array = img_array * 1. / 255.
            img_array = img_array[np.newaxis, :, :, np.newaxis]
            results = self.start_test(model, img_array, fname, label_mapping)
            LOG.info(results)
            self.workspace.mets.set_physical_page_for_file(
                "PHYS_000" + str(n), input_file)
            self.create_logmap_smlink(pcgts)
            self.write_to_mets(results, "PHYS_000" + str(n))
コード例 #22
0
ファイル: resolver.py プロジェクト: ulb-sachsen-anhalt/core
    def resolve_mets_arguments(self, directory, mets_url, mets_basename):
        """
        Resolve the ``--mets``, ``--mets-basename`` and `--directory`` argument
        into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517
        """
        log = getLogger('ocrd.resolver.resolve_mets_arguments')
        mets_is_remote = mets_url and mets_url.startswith(('http://', 'https://'))

        # XXX we might want to be more strict like this but it might break # legacy code
        # Allow --mets and --directory together iff --mets is a remote URL
        # if directory and mets_url and not mets_is_remote:
        #     raise ValueError("Use either --mets or --directory, not both")

        # A remote --mets requires an explicit --directory
        # (not strictly necessary, but retained for legacy behavior)
        if mets_is_remote and not directory:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        if not mets_basename:
            mets_basename = Path(mets_url).name if mets_url else 'mets.xml'
        else:
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

        # Determine --directory and --mets-url
        if directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))
        elif directory:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif mets_url:
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else:
            # neither given: default to the current working directory
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename)
コード例 #23
0
ファイル: base.py プロジェクト: hefv57/core
def run_processor(
    processorClass,
    ocrd_tool=None,
    mets_url=None,
    resolver=None,
    workspace=None,
    page_id=None,
    log_level=None,  # TODO actually use this!
    input_file_grp=None,
    output_file_grp=None,
    parameter=None,
    working_dir=None,
):  # pylint: disable=too-many-locals
    """
    Create a workspace for mets_url and run processor through it.

    Obtains a workspace via ``_get_workspace``, instantiates and runs the
    processor on it, logs the elapsed wall-clock time to the
    ``ocrd.process.profile`` logger, registers the run as a METS agent
    and saves the METS file.

    Args:
        parameter (string): URL to the parameter

    Returns:
        the processor instance that was run
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log.debug("Running processor %s", processorClass)
    processor = processorClass(
        workspace,
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter)
    # the instance may carry a richer ocrd_tool than what was passed in
    ocrd_tool = processor.ocrd_tool
    agent_name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    agent_role = ocrd_tool['steps'][0]
    profile_log = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, agent_name,
              agent_role)
    started = time()
    processor.process()
    elapsed = time() - started
    profile_log.info(
        'Executing processor "%s" took %fs [--input-file-grp="%s" --output-file-grp="%s" --parameter="%s"]' % (
            ocrd_tool['executable'],
            elapsed,
            input_file_grp or '',
            output_file_grp or '',
            parameter or {}))
    workspace.mets.add_agent(
        name=agent_name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=agent_role)
    workspace.save_mets()
    return processor
コード例 #24
0
    def setup(self):
        """
        Load the Mask R-CNN block segmentation model.

        Resolves the weights file configured via the
        ``block_segmentation_weights`` parameter, builds an inference
        configuration from the ``min_confidence`` parameter, and loads
        the weights into ``self.mrcnn_model``.
        """
        # reset ordering counter (presumably consumed during processing — confirm)
        self.order = 0
        model_path = resource_filename(__name__, '../mrcnn')
        model_weights = Path(
            self.resolve_resource(
                self.parameter['block_segmentation_weights']))

        confidence = self.parameter['min_confidence']
        config = InferenceConfig(confidence)
        self.mrcnn_model = model.MaskRCNN(mode="inference",
                                          model_dir=str(model_path),
                                          config=config)
        self.mrcnn_model.load_weights(str(model_weights), by_name=True)
コード例 #25
0
ファイル: document.py プロジェクト: bertsky/browse-ocrd
 def _clone_workspace(cls, mets_url: Union[Path, str]) -> Workspace:
     """
     Clones a workspace (mets.xml and all used files) to a temporary directory for editing
     """
     logger = getLogger(
         'ocrd_browser.model.document.Document._clone_workspace')
     mets_url = cls._strip_local(mets_url, disallow_remote=False)
     # remember the temp dir so it can be cleaned up later
     tmp_dir = mkdtemp(prefix='browse-ocrd-clone-')
     cls.temporary_workspaces.append(tmp_dir)
     # TODO download = False and lazy loading would be nice for responsiveness
     logger.info("Cloning '%s' to '%s'", mets_url, tmp_dir)
     return Resolver().workspace_from_url(mets_url=mets_url,
                                          dst_dir=tmp_dir,
                                          download=True)
コード例 #26
0
 def process(self):
     """
     Copy each file of the input file group to the output file group.

     For PAGE-XML input, the in-memory PcGtsType is re-serialized with an
     updated pcGtsId and fresh metadata. Any other MIME type is copied
     byte-by-byte; if it is an image, an additional PAGE-XML file
     referencing the copy is generated alongside it.
     """
     LOG = getLogger('ocrd.dummy')
     assert_file_grp_cardinality(self.input_file_grp, 1)
     assert_file_grp_cardinality(self.output_file_grp, 1)
     for input_file in self.input_files:
         input_file = self.workspace.download_file(input_file)
         file_id = make_file_id(input_file, self.output_file_grp)
         ext = MIME_TO_EXT.get(input_file.mimetype, '')
         local_filename = join(self.output_file_grp, file_id + ext)
         # input_file was already downloaded above - no need to download again
         pcgts = page_from_file(input_file)
         pcgts.set_pcGtsId(file_id)
         self.add_metadata(pcgts)
         LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
         if input_file.mimetype == MIMETYPE_PAGE:
             # Source file is PAGE-XML: Write out in-memory PcGtsType
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=input_file.mimetype,
                 local_filename=local_filename,
                 content=to_xml(pcgts).encode('utf-8'))
         else:
             # Source file is not PAGE-XML: Copy byte-by-byte
             with open(input_file.local_filename, 'rb') as f:
                 content = f.read()
                 self.workspace.add_file(
                     ID=file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=input_file.mimetype,
                     local_filename=local_filename,
                     content=content)
             if input_file.mimetype.startswith('image/'):
                 # write out the PAGE-XML representation for this image
                 page_file_id = file_id + '_PAGE'
                 pcgts.set_pcGtsId(page_file_id)
                 pcgts.get_Page().set_imageFilename(local_filename)
                 page_filename = join(self.output_file_grp, file_id + '.xml')
                 LOG.info("Add PAGE-XML %s generated for %s at %s",
                          page_file_id, file_id, page_filename)
                 self.workspace.add_file(
                     ID=page_file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename=page_filename,
                     content=to_xml(pcgts).encode('utf-8'))
コード例 #27
0
    def process(self):
        """
        Run text line segmentation on every page of the input file group
        and write the enriched PAGE-XML to the output file group.
        """
        LOG = getLogger('OcrdAnybaseocrTextline')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        op_level = self.parameter['operation_level']

        for page_idx, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            # only operate on images that are already binarized and deskewed
            page_image, page_coords, _ = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized,deskewed')

            if op_level == 'page':
                LOG.warning("Operation level should be region.")
                self._process_segment(page_image, page, None, page_coords,
                                      page_id, input_file, page_idx)
            else:
                regions = page.get_TextRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                    continue
                for region_idx, region in enumerate(regions):
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    self._process_segment(region_image, page, region,
                                          region_coords, region.id,
                                          input_file, region_idx)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'))
コード例 #28
0
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Initialize the resource manager.

        Loads the built-in resource list, then makes sure a user resource
        list exists at ``<xdg_config_home>/ocrd/resources.yml`` (creating it
        with a boilerplate comment if necessary) and loads that as well.

        Args:
            userdir: override for the user directory
            xdg_config_home: override for the XDG config directory
            xdg_data_home: override for the XDG data directory
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._xdg_data_home = xdg_data_home
        self._xdg_config_home = xdg_config_home
        self._userdir = userdir
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            # exist_ok avoids a race if the directory appears concurrently
            self.user_list.parent.mkdir(parents=True, exist_ok=True)
            with open(str(self.user_list), 'w', encoding='utf-8') as f:
                f.write(RESOURCE_USER_LIST_COMMENT)
        self.load_resource_list(self.user_list)
コード例 #29
0
ファイル: workspace.py プロジェクト: MarcGrotheer/core
def workspace_init(ctx, clobber_mets, directory):
    """
    Create a workspace with an empty METS file in --directory.

    """
    log = getLogger('ocrd.cli.workspace.init')
    # legacy positional DIRECTORY argument overrides the context directory
    if directory:
        log.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
        ctx.directory = directory
    new_workspace = ctx.resolver.workspace_from_nothing(
        directory=os.path.abspath(ctx.directory),
        mets_basename=basename(ctx.mets_url),
        clobber_mets=clobber_mets)
    new_workspace.save_mets()
    # echo the workspace location for shell consumption
    print(new_workspace.directory)
コード例 #30
0
def xmllint_format(xml):
    """
    Pretty-print XML like ``xmllint`` does.

    Arguments:
        xml (string): Serialized XML

    Returns:
        bytes: XML declaration followed by the pretty-printed document,
        UTF-8 encoded
    """
    # NOTE: the unused 'log' local (getLogger) was removed
    parser = ET.XMLParser(resolve_entities=False,
                          strip_cdata=False,
                          remove_blank_text=True)
    document = ET.fromstring(xml, parser)
    pretty = ET.tostring(document, pretty_print=True,
                         encoding='UTF-8').decode('utf-8')
    return ('%s\n%s' % ('<?xml version="1.0" encoding="UTF-8"?>',
                        pretty)).encode('utf-8')