def __init__(self, process, ifgs, ifs):
    """Create a page alignment from a list of input files.

    The arguments are stored unchanged on the instance and the line
    alignment is computed immediately.

    Args:
        process: Stored as ``self.process``.
        ifgs: Stored as ``self.ifgs`` (presumably the input file groups --
            confirm against the caller).
        ifs: Stored as ``self.ifs`` (presumably the input files --
            confirm against the caller).
    """
    self.log = getLogger('PageAlignment')
    self.process, self.ifgs, self.ifs = process, ifgs, ifs
    # Alignment runs eagerly; keep this last so every attribute is in place.
    self.align_lines()
def add_files_to_mets(self, convention, mets, directory):
    """
    Add files from a folder to METS, according to a file structure convention.

    Only the ``'ocrd-gt'`` convention is implemented. For any other value a
    warning is logged and nothing is added.

    Args:
        convention (string): Which file structure convention to adhere to.
        mets: Object whose ``add_file`` method is called once per file found.
        directory (string): Folder to scan. May be given with or without a
            trailing path separator.

    'ocrd-gt' convention::

        Folder ==> mets:fileGrp @USE
            root folder       => 'OCR-D-IMG'
            'page'            => 'OCR-D-OCR-PAGE'
            'alto'            => 'OCR-D-OCR-ALTO'
            any other <name>  => upper(<name>)
          Only direct subfolders are scanned, deeper levels are ignored.

        mets:file @ID
            upper(fileGrp + '_' + file name with '.' replaced by '_')

        mets:file @MIMETYPE
            value of the first extension in EXT_TO_MIME the file name ends
            with, 'application/octet-stream' if none matches, and always
            'application/alto+xml' inside the 'alto' folder.

        A file named 'mets.xml' is skipped.
    """
    log = getLogger('ocrd.resolver.add_files_to_mets') # pylint: disable=redefined-outer-name
    log.debug("Reading files in '%s' according to '%s' convention", directory, convention)

    if convention != 'ocrd-gt':
        log.warning("Unknown file structure convention '%s', no files added", convention)
        return

    for root, dirs, files in os.walk(directory):
        # relpath is independent of a trailing separator on `directory` and of
        # the platform's separator, unlike slicing `root` by len(directory).
        relative = os.path.relpath(root, directory)
        dirname = '' if relative == os.curdir else relative

        forced_mimetype = None
        if not dirname:
            fileGrp = 'OCR-D-IMG'
        else:
            # Prune in place so os.walk does not descend below the first level.
            del dirs[:]
            if dirname == 'alto':
                fileGrp = 'OCR-D-OCR-ALTO'
                forced_mimetype = 'application/alto+xml'
            elif dirname == 'page':
                fileGrp = 'OCR-D-OCR-PAGE'
            else:
                fileGrp = dirname.upper()

        for f in files:
            if f == 'mets.xml':
                continue
            if forced_mimetype is not None:
                mimetype = forced_mimetype
            else:
                mimetype = next(
                    (EXT_TO_MIME[ext] for ext in EXT_TO_MIME if f.endswith(ext)),
                    'application/octet-stream')
            local_filename = os.path.join(directory, dirname, f)
            x = mets.add_file(
                fileGrp,
                mimetype=mimetype,
                local_filename=local_filename,
                ID='_'.join([fileGrp, f.replace('.', '_')]).upper(),
                url='file://' + local_filename,
            )
            log.debug("Added as %s", x)
def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None):
    """
    Download a file to the workspace.

    Early Shortcut: If url is a file://-URL and that file is already in the
    directory, keep it there.

    If basename is not given but subdir is, assume the user knows what they
    are doing and use the last URL segment as the basename.

    If basename is not given and no subdir is given, use ``safe_filename(url)``
    as the basename.

    Args:
        directory (string): Directory to download files to
        url (string): URL to download from
        basename (string, None): basename part of the filename on disk.
        overwrite (boolean): Whether to overwrite existing files with that name
        subdir (string, None): Subdirectory to create within the directory. Think fileGrp.

    Returns:
        Local filename

    Raises:
        Exception: If the URL is not a file://-URL and the HTTP status is not 200.
    """
    log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
    log.debug("directory=|%s| url=|%s| basename=|%s| overwrite=|%s| subdir=|%s|", directory, url, basename, overwrite, subdir)

    if basename is None:
        if (subdir is not None) or \
                (directory and url.startswith('file://%s' % directory)):
            # in case downloading a url 'file:///tmp/foo/bar' to directory '/tmp/foo'
            basename = url.rsplit('/', 1)[-1]
        else:
            basename = safe_filename(url)

    if subdir is not None:
        basename = os.path.join(subdir, basename)

    outfilename = os.path.join(directory, basename)

    if os.path.exists(outfilename) and not overwrite:
        log.debug("File already exists and overwrite=False: %s", outfilename)
        return outfilename

    # os.path.dirname yields '' for a bare file name; splitting on '/' would
    # return the file name itself and create a directory in the file's place.
    outfiledir = os.path.dirname(outfilename)
    if outfiledir and not os.path.isdir(outfiledir):
        os.makedirs(outfiledir)

    log.debug("Downloading <%s> to '%s'", url, outfilename)
    if url.startswith('file://'):
        copyfile(url[len('file://'):], outfilename)
    else:
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception("Not found: %s (HTTP %d)" % (url, response.status_code))
        with open(outfilename, 'wb') as outfile:
            outfile.write(response.content)

    return outfilename
def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None, prefer_symlink=None):
    """
    Download a file to the workspace.

    If basename is not given but subdir is, assume the user knows what they
    are doing and use the last URL segment as the basename.

    If basename is not given and no subdir is given, use ``safe_filename(url)``
    as the basename.

    When caching is enabled, a cached copy of the URL is used if available;
    otherwise the freshly retrieved file is put into the cache.

    Args:
        directory (string): Directory to download files to
        url (string): URL to download from
        basename (string, None): basename part of the filename on disk.
        overwrite (boolean): Whether to overwrite existing files with that name
        subdir (string, None): Subdirectory to create within the directory. Think fileGrp.
        prefer_symlink (boolean): Whether to use symlinks instead of copying.
            Passed on to ``self._copy_or_symlink``; meant to override
            self.prefer_symlink.

    Returns:
        Local filename

    Raises:
        Exception: If the URL has to be fetched via HTTP and the status is not 200.
    """
    log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name

    if basename is None:
        if subdir is not None:
            basename = url.rsplit('/', 1)[-1]
        else:
            basename = safe_filename(url)

    if subdir is not None:
        basename = os.path.join(subdir, basename)

    outfilename = os.path.join(directory, basename)

    if os.path.exists(outfilename) and not overwrite:
        log.debug("File already exists and overwrite=False: %s", outfilename)
        return outfilename

    # os.path.dirname yields '' for a bare file name; splitting on '/' would
    # return the file name itself and create a directory in the file's place.
    outfiledir = os.path.dirname(outfilename)
    if outfiledir and not os.path.isdir(outfiledir):
        os.makedirs(outfiledir)

    cached_filename = self.cache.get(url) if self.cache_enabled else False

    if cached_filename:
        log.debug("Found cached version of <%s> at '%s'", url, cached_filename)
        self._copy_or_symlink(cached_filename, outfilename, prefer_symlink)
    else:
        log.debug("Downloading <%s> to '%s'", url, outfilename)
        if url.startswith('file://'):
            self._copy_or_symlink(url[len('file://'):], outfilename, prefer_symlink)
        else:
            # Fetch and check the response BEFORE opening the target file:
            # opening first would leave an empty file behind on failure, which
            # later calls would mistake for a finished download.
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("Not found: %s (HTTP %d)" % (url, response.status_code))
            with open(outfilename, 'wb') as outfile:
                outfile.write(response.content)

    if self.cache_enabled and not cached_filename:
        cached_filename = self.cache.put(url, filename=outfilename)
        log.debug("Stored in cache <%s> at '%s'", url, cached_filename)

    return outfilename
import os from shutil import copyfile from zipfile import ZipFile import tempfile import requests from ocrd.constants import METS_XML_EMPTY, TMP_PREFIX, EXT_TO_MIME from ocrd.utils import getLogger, safe_filename from ocrd.resolver_cache import ResolverCache from ocrd.workspace import Workspace from ocrd.model import OcrdMets log = getLogger('ocrd.resolver') tempfile.tempdir = '/tmp' class Resolver(object): """ Handle Uploads, Downloads, Repository access and manage temporary directories Optionally cache files. Args: cache_enabled (Boolean): Whether to cache files. If True, passes kwargs to ~ResolverCache. prefer_symlink (Boolean): If True, symlink from cached file to the workspace instead of copying to reduce I/O. """ def __init__(self, cache_enabled=False, prefer_symlink=False, **kwargs): """ """ self.cache_enabled = cache_enabled self.prefer_symlink = prefer_symlink self.cache = ResolverCache(**kwargs) if cache_enabled else None
from __future__ import absolute_import from tesserocr import RIL, PSM, PyTessBaseAPI, get_languages, iterate_level from ocrd.utils import getLogger, concat_padded, xywh_from_points, points_from_x0y0x1y1 from ocrd.model.ocrd_page import from_file, to_xml, TextEquivType, CoordsType, GlyphType from ocrd import Processor, MIMETYPE_PAGE from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL log = getLogger('processor.TesserocrRecognize') class TesserocrRecognize(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-recognize'] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrRecognize, self).__init__(*args, **kwargs) def process(self): """ Performs the (text) recognition. """ print(self.parameter) if self.parameter['textequiv_level'] not in ['line', 'glyph']: raise Exception("currently only implemented at the line/glyph level") model = get_languages()[1][-1] # last installed model if 'model' in self.parameter: model = self.parameter['model'] if model not in get_languages()[1]: raise Exception("configured model " + model + " is not installed") with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi: log.info("Using model %s in %s for recognition", model, get_languages()[0])
def __init__(self, *args, **kwargs):
    """Set up the aligner processor.

    The ``ocrd_tool`` and ``version`` keyword arguments are always replaced
    by the 'ocrd-cis-align' entry and the version of the description
    returned by ``get_ocrd_tool()`` before the parent constructor is called.
    """
    tool_description = get_ocrd_tool()
    kwargs.update(
        ocrd_tool=tool_description['tools']['ocrd-cis-align'],
        version=tool_description['version'],
    )
    super(Aligner, self).__init__(*args, **kwargs)
    self.log = getLogger('Processor.Aligner')
import os from ocrd.constants import DEFAULT_CACHE_FOLDER from ocrd.utils import getLogger, safe_filename log = getLogger('ocrd.cache') class ResolverCache(object): """ Cache of downloads, based on URL. Args: cache_directory (string): Where to store cached files """ def __init__(self, cache_directory=DEFAULT_CACHE_FOLDER): """ Instantiate a cache """ self.directory = cache_directory if not os.path.isdir(self.directory): log.info("Cache directory does not exist, creating: '%s'", self.directory) os.makedirs(self.directory) def get(self, url): cached_filename = os.path.join(self.directory, safe_filename(url)) if os.path.exists(cached_filename): return cached_filename def put(self, url, filename=None, content=None):
def __init__(self, jar, main, input_str, args):
    """Remember the parameters of a Java invocation and set up a logger.

    Nothing is executed here; the values are only stored on the instance.

    Args:
        jar: Stored as ``self.jar`` (presumably the path of the JAR file --
            confirm against the caller).
        main: Stored as ``self.main`` (presumably the main class to run --
            confirm against the caller).
        input_str: Stored as ``self.input_str``.
        args: Stored as ``self.args``.
    """
    self.log = getLogger('JavaProcess')
    self.jar, self.main = jar, main
    self.input_str, self.args = input_str, args
def __init__(self, *args, **kwargs):
    """Set up the typegroups classifier processor.

    The ``ocrd_tool`` and ``version`` keyword arguments are always replaced
    by the 'ocrd-typegroups-classifier' entry and the version found in
    ``OCRD_TOOL`` before the parent constructor is called.
    """
    kwargs.update(
        ocrd_tool=OCRD_TOOL['tools']['ocrd-typegroups-classifier'],
        version=OCRD_TOOL['version'],
    )
    super(TypegroupsClassifierProcessor, self).__init__(*args, **kwargs)
    self.log = getLogger('ocrd_typegroups_classifier')
import os import json import subprocess from deprecated.sphinx import deprecated from ocrd.utils import getLogger from ocrd.validator import ParameterValidator log = getLogger('ocrd.processor') def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None): if workspace is None: if resolver is None: raise Exception("Need to pass a resolver to create a workspace") if mets_url is None: raise Exception("Need to pass mets_url to create a workspace") workspace = resolver.workspace_from_url(mets_url, directory=working_dir) return workspace def run_processor( processorClass, ocrd_tool=None, mets_url=None, resolver=None, workspace=None, group_id=None,
import json import re from jsonschema import Draft4Validator, validators # pylint: disable=import-error from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX, OCRD_TOOL_SCHEMA from ocrd.utils import getLogger log = getLogger('ocrd.validator') # http://python-jsonschema.readthedocs.io/en/latest/faq/ def extend_with_default(validator_class): validate_properties = validator_class.VALIDATORS["properties"] def set_defaults(validator, properties, instance, schema): for prop, subschema in properties.items(): if "default" in subschema: instance.setdefault(prop, subschema["default"]) for error in validate_properties(validator, properties, instance, schema): yield error return validators.extend(validator_class, {"properties": set_defaults}) DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator) # # -------------------------------------------------
from __future__ import absolute_import import tesserocr from ocrd.utils import getLogger, concat_padded, points_from_xywh from ocrd.model.ocrd_page import (ReadingOrderType, RegionRefIndexedType, TextRegionType, CoordsType, OrderedGroupType, from_file, to_xml) from ocrd import Processor, MIMETYPE_PAGE from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL log = getLogger('processor.TesserocrSegmentRegion') class TesserocrSegmentRegion(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][ 'ocrd-tesserocr-segment-region'] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentRegion, self).__init__(*args, **kwargs) def process(self): """ Performs the region segmentation. """ with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: print(self.input_file_grp) for (n, input_file) in enumerate(self.input_files): pcgts = from_file(self.workspace.download_file(input_file)) image = self.workspace.resolve_image_as_pil( pcgts.get_Page().imageFilename) log.debug("Detecting regions with tesseract")
import os import sys import shutil import cv2 from PIL import Image import numpy as np from ocrd.model import OcrdMets, OcrdExif from ocrd.utils import getLogger log = getLogger('ocrd.workspace') class Workspace(object): """ A workspace is a temporary directory set up for a processor. It's the interface to the METS/PAGE XML and delegates download and upload to the Resolver. Args: directory (string) : Folder to work in mets (:class:`OcrdMets`) : OcrdMets representing this workspace. Loaded from 'mets.xml' if ``None``. mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url. """ def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml'): self.resolver = resolver
from __future__ import absolute_import import kraken from ocrd.utils import getLogger, mets_file_id from ocrd import Processor, OcrdPage, MIMETYPE_PAGE log = getLogger('processor.KrakenBinarize') class KrakenBinarize(Processor): def process(self): """ Performs the binarization. """ for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) self.workspace.download_file(input_file) page = OcrdPage.from_file(input_file) image_url = page.imageFileName log.info("page %s", page) for region in page.list_textregions(): textlines = region.list_textlines() log.info("About to binarize %i lines of region '%s'", len(textlines), region.ID) for (line_no, line) in enumerate(textlines): log.debug("Binarizing line '%s' in region '%s'", line_no, region.ID) image = self.workspace.resolve_image_as_pil( image_url, line.coords) bin_image = kraken.binarization.nlbin(image) ''' self.add_output_file(
from __future__ import absolute_import from tesserocr import PyTessBaseAPI, RIL from ocrd import Processor, MIMETYPE_PAGE from ocrd.utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points from ocrd.model.ocrd_page import (CoordsType, TextLineType, from_file, to_xml) from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL log = getLogger('processor.TesserocrSegmentLine') class TesserocrSegmentLine(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-line'] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentLine, self).__init__(*args, **kwargs) def process(self): """ Performs the line segmentation. """ with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: for (n, input_file) in enumerate(self.input_files): pcgts = from_file(self.workspace.download_file(input_file)) image_url = pcgts.get_Page().imageFilename for region in pcgts.get_Page().get_TextRegion(): log.debug("Detecting lines in %s with tesseract", region.id) image = self.workspace.resolve_image_as_pil( image_url, polygon_from_points(region.get_Coords().points))
from __future__ import absolute_import from tesserocr import RIL, PyTessBaseAPI, OEM, PSM from ocrd import Processor, MIMETYPE_PAGE from ocrd.utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points from ocrd.model.ocrd_page import (CoordsType, WordType, from_file, to_xml) from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL log = getLogger('processor.TesserocrSegmentWord') class TesserocrSegmentWord(Processor): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-word'] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentWord, self).__init__(*args, **kwargs) def process(self): """ Performs the line segmentation. """ with PyTessBaseAPI( psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX, ) as tessapi: for (n, input_file) in enumerate(self.input_files): pcgts = from_file(self.workspace.download_file(input_file)) image_url = pcgts.get_Page().imageFilename for region in pcgts.get_Page().get_TextRegion(): for line in region.get_TextLine(): log.debug("Detecting words in line '%s'", line.id)
import re from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX from .report import ValidationReport from ocrd.utils import getLogger log = getLogger('ocrd.workspace_validator') # # ------------------------------------------------- # class WorkspaceValidator(object): """ Validates an OCR-D/METS workspace against the specs. Args: resolver (:class:`Resolver`) : Instance of a resolver mets_url (string) : URL of the METS file """ def __init__(self, resolver, mets_url, directory=None): self.resolver = resolver self.mets_url = mets_url self.report = ValidationReport() log.debug('resolver=%s mets_url=%s directory=%s', resolver, mets_url, directory) if mets_url is None: mets_url = '%s/mets.xml' % directory self.workspace = self.resolver.workspace_from_url(mets_url, directory=directory)