def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
    """
    Create a workspace from a METS by URL (i.e. clone it). Sets the mets.xml file

    Arguments:
        mets_url (string): Source mets URL
        dst_dir (string, None): Target directory for the workspace
        clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
        download (boolean, False): Whether to download all the files
        src_baseurl (string, None): Base URL for resolving relative file locations

    Returns:
        Workspace
    """
    if mets_url is None:
        raise ValueError("Must pass 'mets_url' workspace_from_url")
    # if mets_url is a relative filename, make it absolute
    if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
        mets_url = str(Path(Path.cwd() / mets_url))
    # if mets_basename is not given, use the last URL segment of the mets_url
    if mets_basename is None:
        mets_basename = nth_url_segment(mets_url, -1)
    # If src_baseurl wasn't given, determine from mets_url by removing last url segment
    if not src_baseurl:
        last_segment = nth_url_segment(mets_url)
        src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])
    # resolve dst_dir
    if not dst_dir:
        if is_local_filename(mets_url):
            log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
            dst_dir = Path(mets_url).parent
        else:
            # BUGFIX: create the temporary directory *before* logging it —
            # previously the log line interpolated the still-unset dst_dir
            # and always printed "ephemeral workspace 'None'".
            dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
            log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
    # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
    if not Path(dst_dir).exists():
        Path(dst_dir).mkdir(parents=True, exist_ok=False)
    dst_dir = str(Path(dst_dir).resolve())
    log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
              mets_basename, mets_url, src_baseurl, dst_dir)
    self.download_to_directory(dst_dir, mets_url, basename=mets_basename,
                               if_exists='overwrite' if clobber_mets else 'raise')
    workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)
    if download:
        for f in workspace.mets.find_files():
            workspace.download_file(f)
    return workspace
def _validate_imagefilename(self):
    """
    Validate that the imageFilename is correctly set to a filename relative to the workspace
    """
    self.log.debug('_validate_imagefilename')
    for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
        # remote PAGE files are only checked when downloading was requested
        if not is_local_filename(page_file.url) and not self.download:
            self.report.add_notice("Won't download remote PAGE XML <%s>" % page_file.url)
            continue
        self.workspace.download_file(page_file)
        page = page_from_file(page_file).get_Page()
        image_filename = page.imageFilename
        # the referenced image must be registered in the METS ...
        if not self.mets.find_files(url=image_filename):
            self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (page_file.url, image_filename))
        # ... and, if local, must actually exist on disk
        if is_local_filename(image_filename) and not Path(image_filename).exists():
            self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (page_file.url, image_filename))
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.)
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        msg = "File does not exist: %s" % mets
        raise Exception(msg)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    processor = Processor(workspace,
                          ocrd_tool=None,
                          page_id=kwargs['page_id'],
                          input_file_grp=kwargs['input_file_grp'],
                          output_file_grp=kwargs['output_file_grp'])
    fields = ('url', 'ID', 'mimetype', 'pageId')
    for input_file in processor.input_files:
        # make this bash-friendly (show initialization for associative array)
        print(' '.join("[%s]='%s'" % (field, getattr(input_file, field)) for field in fields), end=' ')
        print("[outputFileId]='%s'" % make_file_id(input_file, kwargs['output_file_grp']))
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, version=False, **kwargs):
    """
    Wrap a processor class as a command line interface.

    Handles the --dump-json and --version shortcuts, validates the METS path,
    instantiates a workspace and delegates to ``run_processor``.

    Raises:
        Exception: if ``mets`` is not given or points to a non-existent local file.
    """
    LOG = getLogger('ocrd_cli_wrap_processor')
    if dump_json:
        processorClass(workspace=None, dump_json=True)
    elif version:
        # BUGFIX: the original read `except e: pass`, which raises NameError
        # ('e' is undefined) as soon as instantiation fails, and then printed
        # `p` unconditionally (UnboundLocalError on failure).
        p = None
        try:
            p = processorClass(workspace=None)
        except Exception:
            pass
        if p is not None:
            print("Version %s, ocrd/core %s" % (p.version, OCRD_VERSION))
    elif mets is None:
        msg = 'Error: Missing option "-m" / "--mets".'
        LOG.error(msg)
        raise Exception(msg)
    else:
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
    """
    Search ``mets:file`` in this METS document.

    Args:
        ID (string) : ID of the file
        fileGrp (string) : USE of the fileGrp to list files of
        pageId (string) : ID of physical page manifested by matching files
        url (string) : @xlink:href of mets:Flocat of mets:file
        mimetype (string) : MIMETYPE of matching files
        local_only (boolean) : Whether to restrict results to local files

    Return:
        List of files.
    """
    ret = []
    # build the XPath predicates from the given filters
    fileGrp_clause = '' if fileGrp is None else '[@USE="%s"]' % fileGrp
    file_clause = ''
    if ID is not None:
        file_clause += '[@ID="%s"]' % ID
    if mimetype is not None:
        file_clause += '[@MIMETYPE="%s"]' % mimetype
    if url is not None:
        file_clause += '[mets:FLocat[@xlink:href = "%s"]]' % url
    # TODO lxml says invalid predicate. I disagree
    # if local_only:
    #     file_clause += "[mets:FLocat[starts-with(@xlink:href, 'file://')]]"
    # Search
    file_ids = self._tree.getroot().xpath("//mets:fileGrp%s/mets:file%s/@ID" % (fileGrp_clause, file_clause), namespaces=NS)
    if pageId is not None:
        # restrict to the files referenced from the given physical page
        by_pageid = self._tree.getroot().xpath('//mets:div[@TYPE="page"][@ID="%s"]/mets:fptr/@FILEID' % pageId, namespaces=NS)
        file_ids = [i for i in by_pageid if i in file_ids]
    # instantiate / get from cache
    for file_id in file_ids:
        if file_id not in self._file_by_id:
            # PERF: only pay for the per-ID element lookup on a cache miss
            # (previously the find() ran for every result, cached or not)
            el = self._tree.getroot().find('.//mets:file[@ID="%s"]' % file_id, NS)
            self._file_by_id[file_id] = OcrdFile(el, mets=self)
        # If only local resources should be returned and file is not a file path: skip the file
        url = self._file_by_id[file_id].url
        if local_only and not is_local_filename(url):
            continue
        ret.append(self._file_by_id[file_id])
    return ret
def _validate_dimension(self):
    """
    Validate image height and PAGE imageHeight match
    """
    self.log.info('_validate_dimension')
    for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
        # without --download we cannot inspect remote images referenced by the PAGE
        if not is_local_filename(page_file.url) and not self.download:
            self.report.add_notice("_validate_dimension: Not executed because --download wasn't set and PAGE might reference remote (Alternative)Images <%s>" % page_file.url)
            continue
        page = page_from_file(page_file).get_Page()
        _, _, exif = self.workspace.image_from_page(page, page_file.pageId)
        # compare the PAGE-declared dimensions against the actual image metadata
        if page.imageHeight != exif.height:
            self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (page_file.ID, page.imageHeight, exif.height))
        if page.imageWidth != exif.width:
            self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (page_file.ID, page.imageWidth, exif.width))
def _validate_pixel_density(self):
    """
    Validate image pixel density

    See `spec <https://ocr-d.github.io/mets#pixel-density-of-images-must-be-explicit-and-high-enough>`_.
    """
    self.log.debug('_validate_pixel_density')
    image_files = (f for f in self.mets.find_files() if f.mimetype.startswith('image/'))
    for image_file in image_files:
        if not is_local_filename(image_file.url) and not self.download:
            self.report.add_notice("Won't download remote image <%s>" % image_file.url)
            continue
        exif = self.workspace.resolve_image_exif(image_file.url)
        # both axes must declare a plausible (> 72) resolution
        for axis in ('xResolution', 'yResolution'):
            resolution = exif.__dict__.get(axis)
            if resolution is None or resolution <= 72:
                self.report.add_notice("Image %s: %s (%s pixels per %s) is suspiciously low" % (image_file.ID, axis, resolution, exif.resolutionUnit))
def _validate_multipage(self):
    """
    Validate the number of images per file is 1 (TIFF allows multi-page images)

    See `spec <https://ocr-d.github.io/mets#no-multi-page-images>`_.
    """
    image_files = (f for f in self.mets.find_files() if f.mimetype.startswith('image/'))
    for image_file in image_files:
        if not is_local_filename(image_file.url) and not self.download:
            self.report.add_notice("Won't download remote image <%s>" % image_file.url)
            continue
        exif = self.workspace.resolve_image_exif(image_file.url)
        # a single mets:file must manifest exactly one image frame
        if exif.n_frames > 1:
            self.report.add_error("Image %s: More than 1 frame: %s" % (image_file.ID, exif.n_frames))
def _bag_mets_files(self, workspace, bagdir, ocrd_manifestation_depth, ocrd_mets, processes):
    """
    Fetch or copy all ``mets:file`` assets of ``workspace`` into the bag's
    ``data`` directory, rewrite their URLs to bag-relative paths, serialize
    the METS and generate the payload manifests.

    Returns:
        Tuple ``(total_bytes, total_files)`` as reported by ``make_manifests``.
    """
    mets = workspace.mets
    # TODO allow filtering by fileGrp@USE and such
    oldpwd = getcwd()
    try:
        # BUGFIX: restore the working directory even if resolution, download
        # or manifest generation raises (cwd was previously leaked on error).
        chdir(workspace.directory)
        for f in mets.find_files():
            log.info("Resolving %s (%s)", f.url, ocrd_manifestation_depth)
            if is_local_filename(f.url):
                f.url = abspath(f.url)
            # XXX cannot happen because chdir above
            # elif is_local_filename(join(workspace.directory, 'data', f.url)):
            #     f.url = abspath(join(workspace.directory, 'data', f.url))
            elif ocrd_manifestation_depth != 'full':
                self._log_or_raise("Not fetching non-local files, skipping %s" % f.url, oldpwd)
                continue
            elif not f.url.startswith('http'):
                self._log_or_raise("Not an http URL: %s" % f.url, oldpwd)
                continue
            log.info("Resolved %s", f.url)
            file_grp_dir = join(bagdir, 'data', f.fileGrp)
            if not isdir(file_grp_dir):
                makedirs(file_grp_dir)
            self.resolver.download_to_directory(file_grp_dir, f.url, basename=f.ID)
            f.url = join(f.fileGrp, f.ID)
        # save mets.xml -- use a distinct handle name so the loop's OcrdFile
        # variable `f` is not shadowed (was reused for the file handle before)
        with open(join(bagdir, 'data', ocrd_mets), 'wb') as mets_fh:
            mets_fh.write(workspace.mets.to_xml())
        chdir(bagdir)
        total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
    finally:
        chdir(oldpwd)
    return total_bytes, total_files
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, help=False, version=False, **kwargs):
    """
    Wrap a processor class as a command line interface: handle the
    --dump-json/--help/--version shortcuts, validate the METS path and the
    input/output file groups, then delegate to ``run_processor``.
    """
    LOG = getLogger('ocrd_cli_wrap_processor')
    # informational shortcuts need no workspace
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return
    if help:
        processorClass(workspace=None, show_help=True)
        return
    if version:
        processorClass(workspace=None, show_version=True)
        return
    if mets is None:
        msg = 'Error: Missing option "-m" / "--mets".'
        LOG.error(msg)
        raise Exception(msg)
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        msg = "File does not exist: %s" % mets
        LOG.error(msg)
        raise Exception(msg)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    # TODO once we implement 'overwrite' CLI option and mechanism, disable the
    # `output_file_grp_ check by setting to False-y value if 'overwrite' is set
    report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], kwargs['output_file_grp'])
    if not report.is_valid:
        raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None):
    """
    Wrap an existing ``mets:file`` element.

    Args:
        el (LxmlElement): etree Element of the ``mets:file`` this represents.

    Keyword Args:
        mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
        mimetype (string): ``@MIMETYPE`` of this ``mets:file``
        pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file``
        loctype (string): ``@LOCTYPE`` of this ``mets:file``
        local_filename (string): Local filename
        url (string): ``@xlink:href`` of this ``mets:file``
        ID (string): ``@ID`` of this ``mets:file``

    Raises:
        ValueError: if ``el`` is not provided.
    """
    if el is None:
        raise ValueError(
            "Must provide mets:file element this OcrdFile represents")
    self._el = el
    self.mets = mets
    self.ID = ID
    self.mimetype = mimetype
    self.local_filename = local_filename
    self.loctype = loctype
    self.pageId = pageId
    if url:
        self.url = url
    # derive a local filename from a local URL when none was given explicitly
    if not local_filename and self.url and is_local_filename(self.url):
        self.local_filename = get_local_filename(self.url)
def download_file(self, f):
    """
    Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.
    """
    oldpwd = os.getcwd()
    try:
        os.chdir(self.directory)
        if is_local_filename(f.url):
            # already on disk: just record the absolute path
            f.local_filename = abspath(f.url)
        elif f.local_filename:
            log.debug("Already downloaded: %s", f.local_filename)
        else:
            f.local_filename = self.download_url(f.url, basename='%s/%s' % (f.fileGrp, f.ID))
    finally:
        os.chdir(oldpwd)
    return f
def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None):
    """
    Wrap (or create) a ``mets:file`` element.

    Args:
        el (LxmlElement): etree Element of the mets:file this represents. Create new if not provided

    Keyword Args:
        mimetype (string): MIME type of the file
        pageId (string): ID of the physical page
        loctype (string): METS @LOCTYPE
        local_filename (string): Local filename
        mets (OcrdMets): Containing OcrdMets
        url (string): xlink:href of the file
        ID (string): @ID of the mets:file
    """
    # create a fresh element when none was handed in
    self._el = ET.Element(TAG_METS_FILE) if el is None else el
    self.mets = mets
    self.ID = ID
    self.mimetype = mimetype
    self.local_filename = local_filename
    self.loctype = loctype
    self.pageId = pageId
    if url:
        self.url = url
    # derive a local filename from a local URL when none was given explicitly
    if not local_filename and self.url and is_local_filename(self.url):
        self.local_filename = get_local_filename(self.url)
def ocrd_cli_wrap_processor(
        processorClass,
        ocrd_tool=None,
        mets=None,
        working_dir=None,
        dump_json=False,
        help=False,  # pylint: disable=redefined-builtin
        version=False,
        overwrite=False,
        **kwargs):
    """
    Implement the CLI of an OCR-D processor: handle the --dump-json/--help/--version
    shortcuts, merge parameter overrides, resolve the workspace, validate the
    input/output file groups and finally delegate to ``run_processor``.
    """
    if not sys.argv[1:]:
        # no CLI arguments at all: print help and exit with failure status
        processorClass(workspace=None, show_help=True)
        sys.exit(1)
    if dump_json or help or version:
        # informational shortcuts need no workspace; exit successfully afterwards
        processorClass(workspace=None, dump_json=dump_json, show_help=help, show_version=version)
        sys.exit()
    else:
        initLogging()
        LOG = getLogger('ocrd_cli_wrap_processor')
        # LOG.info('kwargs=%s' % kwargs)
        # Merge parameter overrides and parameters
        if 'parameter_override' in kwargs:
            set_json_key_value_overrides(kwargs['parameter'], *kwargs['parameter_override'])
        # TODO OCR-D/core#274
        # Assert -I / -O
        # if not kwargs['input_file_grp']:
        #     raise ValueError('-I/--input-file-grp is required')
        # if not kwargs['output_file_grp']:
        #     raise ValueError('-O/--output-file-grp is required')
        if is_local_filename(mets) and not isfile(get_local_filename(mets)):
            msg = "File does not exist: %s" % mets
            LOG.error(msg)
            raise Exception(msg)
        resolver = Resolver()
        workspace = resolver.workspace_from_url(mets, working_dir)
        page_id = kwargs.get('page_id')
        # XXX not possible while processors do not adhere to
        # https://github.com/OCR-D/core/issues/505
        # if overwrite
        #     if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
        #         raise Exception("--overwrite requires --output-file-grp")
        #     LOG.info("Removing files because of --overwrite")
        #     for grp in kwargs['output_file_grp'].split(','):
        #         if page_id:
        #             for one_page_id in kwargs['page_id'].split(','):
        #                 LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
        #                 for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
        #                     workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
        #         else:
        #             LOG.debug("Removing all files in output file group %s ", grp)
        #             # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
        #             workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
        #     workspace.save_mets()
        # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
        if overwrite:
            workspace.overwrite_mode = True
        # NOTE(review): with --overwrite the output fileGrp check is skipped
        # by passing '' — confirm this is the intended interim behavior
        report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id)
        if not report.is_valid:
            raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
        run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
    """
    Download a file to a directory.

    Early Shortcut: If url is a local file and that file is already in the directory, keep it there.

    If basename is not given but subdir is, assume user knows what she's doing and
    use last URL segment as the basename.

    If basename is not given and no subdir is given, use the alnum characters in the URL as the basename.

    Args:
        directory (string): Directory to download files to
        basename (string, None): basename part of the filename on disk.
        url (string): URL to download from
        if_exists (string, "skip"): What to do if target file already exists.
            One of ``skip`` (default), ``overwrite`` or ``raise``
        subdir (string, None): Subdirectory to create within the directory. Think fileGrp.

    Returns:
        Local filename, __relative__ to directory
    """
    log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
    log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

    if not url:
        raise Exception("'url' must be a string")
    if not directory:
        raise Exception("'directory' must be a string")  # actually Path would also work

    # normalize the target directory, creating it if necessary
    directory = Path(directory)
    directory.mkdir(parents=True, exist_ok=True)
    directory = str(directory.resolve())

    # compose the directory-relative target path from subdir and basename
    subdir_path = Path(subdir if subdir else '')
    basename_path = Path(basename if basename else nth_url_segment(url))
    rel_filename = str(Path(subdir_path, basename_path))
    dest_path = Path(directory, rel_filename)

    local_src = None
    if is_local_filename(url):
        try:
            # XXX this raises FNFE in Python 3.5 if the path doesn't exist but not 3.6+
            local_src = Path(get_local_filename(url)).resolve()
        except FileNotFoundError as e:
            log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, local_src))
            raise e
        if not local_src.exists():
            raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
        if local_src == dest_path:
            # early shortcut: source already is the target
            log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (local_src, url))
            return rel_filename

    # Respect 'if_exists' arg ('overwrite' simply falls through)
    if dest_path.exists():
        if if_exists == 'skip':
            return rel_filename
        if if_exists == 'raise':
            raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dest_path))

    # ensure the parent directory of the target exists
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    # copy local files, fetch remote assets over HTTP
    if local_src:
        log.debug("Copying file '%s' to '%s'", local_src, dest_path)
        dest_path.write_bytes(local_src.read_bytes())
    else:
        log.debug("Downloading URL '%s' to '%s'", url, dest_path)
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
        dest_path.write_bytes(response.content)

    return rel_filename
def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
    """
    Search ``mets:file`` entries in this METS document and yield results.

    The :py:attr:`ID`, :py:attr:`fileGrp`, :py:attr:`url` and :py:attr:`mimetype`
    parameters can each be either a literal string, or a regular expression if
    the string starts with ``//`` (double slash).

    If it is a regex, the leading ``//`` is removed and candidates are matched
    against the regex with `re.fullmatch`. If it is a literal string, comparison
    is done with string equality.

    The :py:attr:`pageId` parameter supports the numeric range operator ``..``.
    For example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``,
    ``PHYS_0001..PHYS_0003`` will be expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``.

    Keyword Args:
        ID (string) : ``@ID`` of the ``mets:file``
        fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
        pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page)
        url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file``
        mimetype (string) : ``@MIMETYPE`` of ``mets:file``
        local_only (boolean) : Whether to restrict results to local files in the filesystem

    Yields:
        :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
    """
    # BUGFIX: removed dead `ret = []` accumulator left over from the
    # pre-generator implementation (results are yielded, never appended).
    if pageId:
        if pageId.startswith(REGEX_PREFIX):
            raise Exception("find_files does not support regex search for pageId")
        # reuse `pageId` as the set of matching FILEIDs from here on
        pageIds, pageId = pageId.split(','), list()
        pageIds_expanded = []
        for pageId_ in pageIds:
            if '..' in pageId_:
                pageIds_expanded += generate_range(*pageId_.split('..', 2))
        pageIds += pageIds_expanded
        for page in self._tree.getroot().xpath('//mets:div[@TYPE="page"]', namespaces=NS):
            if page.get('ID') in pageIds:
                pageId.extend([fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)])
    for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS):
        if ID:
            if ID.startswith(REGEX_PREFIX):
                if not fullmatch(ID[REGEX_PREFIX_LEN:], cand.get('ID')):
                    continue
            else:
                if not ID == cand.get('ID'):
                    continue
        if pageId is not None and cand.get('ID') not in pageId:
            continue
        if fileGrp:
            if fileGrp.startswith(REGEX_PREFIX):
                if not fullmatch(fileGrp[REGEX_PREFIX_LEN:], cand.getparent().get('USE')):
                    continue
            else:
                if cand.getparent().get('USE') != fileGrp:
                    continue
        if mimetype:
            if mimetype.startswith(REGEX_PREFIX):
                if not fullmatch(mimetype[REGEX_PREFIX_LEN:], cand.get('MIMETYPE') or ''):
                    continue
            else:
                if cand.get('MIMETYPE') != mimetype:
                    continue
        if url:
            cand_locat = cand.find('mets:FLocat', namespaces=NS)
            if cand_locat is None:
                # no FLocat at all: cannot match a url filter
                continue
            cand_url = cand_locat.get('{%s}href' % NS['xlink'])
            if url.startswith(REGEX_PREFIX):
                if not fullmatch(url[REGEX_PREFIX_LEN:], cand_url):
                    continue
            else:
                if cand_url != url:
                    continue
        f = OcrdFile(cand, mets=self)
        # If only local resources should be returned and f is not a file path: skip the file
        if local_only and not is_local_filename(f.url):
            continue
        yield f
def test_is_local_filename(self):
    """A file:// URL with an empty host counts as a local filename."""
    self.assertTrue(is_local_filename('file:///'))
def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
    """
    Search ``mets:file`` in this METS document.

    The ``ID``, ``fileGrp``, ``url`` and ``mimetype`` parameters can be either a
    literal string or a regular expression if the string starts with ``//``
    (double slash).

    If it is a regex, the leading ``//`` is removed and candidates are matched
    against the regex with ``re.fullmatch``. If it is a literal string,
    comparison is done with string equality.

    Args:
        ID (string) : ID of the file
        fileGrp (string) : USE of the fileGrp to list files of
        pageId (string) : ID of physical page manifested by matching files
        url (string) : @xlink:href of mets:Flocat of mets:file
        mimetype (string) : MIMETYPE of matching files
        local_only (boolean) : Whether to restrict results to local files

    Return:
        List of files.
    """
    ret = []
    if pageId:
        if pageId.startswith(REGEX_PREFIX):
            raise Exception(
                "find_files does not support regex search for pageId")
        # reuse `pageId` as the set of matching FILEIDs from here on
        pageIds, pageId = pageId.split(','), list()
        for page in self._tree.getroot().xpath('//mets:div[@TYPE="page"]', namespaces=NS):
            if page.get('ID') in pageIds:
                pageId.extend([
                    fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)
                ])
    for cand in self._tree.getroot().xpath('//mets:file', namespaces=NS):
        if ID:
            if ID.startswith(REGEX_PREFIX):
                if not fullmatch(ID[REGEX_PREFIX_LEN:], cand.get('ID')):
                    continue
            else:
                if not ID == cand.get('ID'):
                    continue
        if pageId is not None and cand.get('ID') not in pageId:
            continue
        if fileGrp:
            if fileGrp.startswith(REGEX_PREFIX):
                if not fullmatch(fileGrp[REGEX_PREFIX_LEN:], cand.getparent().get('USE')):
                    continue
            else:
                if cand.getparent().get('USE') != fileGrp:
                    continue
        if mimetype:
            if mimetype.startswith(REGEX_PREFIX):
                if not fullmatch(mimetype[REGEX_PREFIX_LEN:], cand.get('MIMETYPE') or ''):
                    continue
            else:
                if cand.get('MIMETYPE') != mimetype:
                    continue
        if url:
            # BUGFIX: a mets:file without mets:FLocat previously crashed with
            # AttributeError (None.get); skip it instead, consistent with the
            # generator variant of find_files.
            cand_locat = cand.find('mets:FLocat', namespaces=NS)
            if cand_locat is None:
                continue
            cand_url = cand_locat.get('{%s}href' % NS['xlink'])
            if url.startswith(REGEX_PREFIX):
                if not fullmatch(url[REGEX_PREFIX_LEN:], cand_url):
                    continue
            else:
                if cand_url != url:
                    continue
        f = OcrdFile(cand, mets=self)
        # If only local resources should be returned and f is not a file path: skip the file
        if local_only and not is_local_filename(f.url):
            continue
        ret.append(f)
    return ret
def test_is_local_filename(self):
    """Plain paths and file:-scheme URLs are local; unknown schemes are not."""
    for candidate in ('/foo/bar', 'file:///foo/bar', 'file:/foo/bar', 'foo/bar'):
        self.assertTrue(is_local_filename(candidate))
    self.assertFalse(is_local_filename('bad-scheme://foo/bar'))
def _bag_mets_files(self, workspace, bagdir, ocrd_manifestation_depth, ocrd_mets, processes):
    """
    Copy/fetch all mets:file assets into the bag's ``data`` directory,
    rewrite their URLs to bag-relative paths (also inside PAGE-XML), save
    the METS and generate the payload manifests.

    Returns the (total_bytes, total_files) tuple from ``make_manifests``.
    """
    mets = workspace.mets
    # maps old (absolute/remote) URL -> new bag-relative path
    changed_urls = {}
    # TODO allow filtering by fileGrp@USE and such
    with pushd_popd(workspace.directory):
        # URLs of the files before changing
        for f in mets.find_files():
            log.info("Resolving %s (%s)", f.url, ocrd_manifestation_depth)
            if is_local_filename(f.url):
                # nothing to do then
                pass
            elif ocrd_manifestation_depth != 'full':
                self._log_or_raise("Not fetching non-local files, skipping %s" % f.url)
                continue
            elif not f.url.startswith('http'):
                self._log_or_raise("Not an http URL: %s" % f.url)
                continue
            log.info("Resolved %s", f.url)
            file_grp_dir = join(bagdir, 'data', f.fileGrp)
            if not isdir(file_grp_dir):
                makedirs(file_grp_dir)
            _basename = "%s%s" % (f.ID, f.extension)
            _relpath = join(f.fileGrp, _basename)
            self.resolver.download_to_directory(file_grp_dir, f.url, basename=_basename)
            changed_urls[f.url] = _relpath
            f.url = _relpath
        # save mets.xml
        with open(join(bagdir, 'data', ocrd_mets), 'wb') as f:
            f.write(workspace.mets.to_xml())
    # Walk through bagged workspace and fix the PAGE
    # Page/@imageFilename and
    # AlternativeImage/@filename
    bag_workspace = Workspace(self.resolver, directory=join(bagdir, 'data'))
    with pushd_popd(bag_workspace.directory):
        for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE):
            pcgts = page_from_file(page_file)
            changed = False
            # page_doc.set(imageFileName
            # for old, new in changed_urls:
            for old, new in changed_urls.items():
                if pcgts.get_Page().imageFilename == old:
                    pcgts.get_Page().imageFilename = new
                    changed = True
                # TODO replace AlternativeImage, recursively...
            # only rewrite the PAGE file if a URL was actually substituted
            if changed:
                with open(page_file.url, 'w') as out:
                    out.write(to_xml(pcgts))
            # log.info("Replace %s -> %s in %s" % (old, new, page_file))
        # NOTE(review): make_manifests appears to expect cwd == bagdir;
        # pushd_popd restores the working directory on exit — confirm the
        # caller does not depend on cwd being bagdir afterwards
        chdir(bagdir)
        total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512'])
    log.info("New vs. old: %s" % changed_urls)
    return total_bytes, total_files
def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
    """
    Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

    Arguments:
        mets_url (string): Source METS URL or filesystem path

    Keyword Arguments:
        dst_dir (string, None): Target directory for the workspace. \
            By default create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
            (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
            By default existing ``mets.xml`` will raise an exception.
        download (boolean, False): Whether to also download all the files referenced by the METS
        src_baseurl (string, None): Base URL for resolving relative file locations

    Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
    the former is already local and the latter is ``none`` or already identical to its directory name.

    Returns:
        a new :py:class:`~ocrd.workspace.Workspace`
    """
    log = getLogger('ocrd.resolver.workspace_from_url')
    if mets_url is None:
        raise ValueError("Must pass 'mets_url' workspace_from_url")
    # interpret a relative local path w.r.t. the current working directory
    if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
        mets_url = str(Path(Path.cwd() / mets_url))
    # default the basename to the last URL segment
    if mets_basename is None:
        mets_basename = nth_url_segment(mets_url, -1)
    # default the base URL to everything up to the last URL segment
    if not src_baseurl:
        last_segment = nth_url_segment(mets_url)
        src_baseurl = remove_non_path_from_url(
            remove_non_path_from_url(mets_url)[:-len(last_segment)])
    # default the target directory: next to a local METS, or a fresh tempdir
    if not dst_dir:
        if is_local_filename(mets_url):
            log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
            dst_dir = Path(mets_url).parent
        else:
            log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
            dst_dir = mkdtemp(prefix=TMP_PREFIX)
    # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
    if not Path(dst_dir).exists():
        Path(dst_dir).mkdir(parents=True, exist_ok=False)
    dst_dir = str(Path(dst_dir).resolve())
    log.debug(
        "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
        mets_basename, mets_url, src_baseurl, dst_dir)
    self.download_to_directory(
        dst_dir, mets_url, basename=mets_basename,
        if_exists='overwrite' if clobber_mets else 'skip')
    workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)
    if download:
        for f in workspace.mets.find_files():
            workspace.download_file(f)
    return workspace