def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None): """ Create a workspace from a METS by URL (i.e. clone it). Sets the mets.xml file Arguments: mets_url (string): Source mets URL dst_dir (string, None): Target directory for the workspace clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception. download (boolean, False): Whether to download all the files src_baseurl (string, None): Base URL for resolving relative file locations Returns: Workspace """ if mets_url is None: raise ValueError("Must pass 'mets_url' workspace_from_url") # if mets_url is a relative filename, make it absolute if is_local_filename(mets_url) and not Path(mets_url).is_absolute(): mets_url = str(Path(Path.cwd() / mets_url)) # if mets_basename is not given, use the last URL segment of the mets_url if mets_basename is None: mets_basename = nth_url_segment(mets_url, -1) # If src_baseurl wasn't given, determine from mets_url by removing last url segment if not src_baseurl: last_segment = nth_url_segment(mets_url) src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)]) # resolve dst_dir if not dst_dir: if is_local_filename(mets_url): log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url) dst_dir = Path(mets_url).parent else: log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url) dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX) # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently if not Path(dst_dir).exists(): Path(dst_dir).mkdir(parents=True, exist_ok=False) dst_dir = str(Path(dst_dir).resolve()) log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl) if download: for f in workspace.mets.find_files(): workspace.download_file(f) return workspace
def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None): """ Download a file to a directory. Early Shortcut: If url is a local file and that file is already in the directory, keep it there. If basename is not given but subdir is, assume user knows what she's doing and use last URL segment as the basename. If basename is not given and no subdir is given, use the alnum characters in the URL as the basename. Args: directory (string): Directory to download files to basename (string, None): basename part of the filename on disk. url (string): URL to download from if_exists (string, "skip"): What to do if target file already exists. One of ``skip`` (default), ``overwrite`` or ``raise`` subdir (string, None): Subdirectory to create within the directory. Think fileGrp. Returns: Local filename, __relative__ to directory """ log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) if not url: raise Exception("'url' must be a string") if not directory: raise Exception("'directory' must be a string") # acutally Path would also work directory = Path(directory) directory.mkdir(parents=True, exist_ok=True) directory = str(directory.resolve()) subdir_path = Path(subdir if subdir else '') basename_path = Path(basename if basename else nth_url_segment(url)) ret = str(Path(subdir_path, basename_path)) dst_path = Path(directory, ret) # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) # print('url=%s', url) # print('directory=%s', directory) # print('subdir_path=%s', subdir_path) # print('basename_path=%s', basename_path) # print('ret=%s', ret) # print('dst_path=%s', dst_path) src_path = None if is_local_filename(url): try: # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+ src_path = Path(get_local_filename(url)).resolve() except FileNotFoundError as e: log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path)) raise e if not src_path.exists(): raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url) if src_path == dst_path: log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) return ret # Respect 'if_exists' arg if dst_path.exists(): if if_exists == 'skip': return ret if if_exists == 'raise': raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path)) # Create dst_path parent dir dst_path.parent.mkdir(parents=True, exist_ok=True) # Copy files or download remote assets if src_path: log.debug("Copying file '%s' to '%s'", src_path, dst_path) dst_path.write_bytes(src_path.read_bytes()) else: log.debug("Downloading URL '%s' to '%s'", url, dst_path) response = requests.get(url) if response.status_code != 200: raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code)) dst_path.write_bytes(response.content) return ret
def test_nth_url_segment(self): self.assertEqual(nth_url_segment(''), '') self.assertEqual(nth_url_segment('foo'), 'foo') self.assertEqual(nth_url_segment('foo', n=-1), 'foo') self.assertEqual(nth_url_segment('foo', n=-2), '') self.assertEqual(nth_url_segment('foo/bar', n=-2), 'foo') self.assertEqual(nth_url_segment('/baz/bar', n=-2), 'baz') self.assertEqual(nth_url_segment('foo/'), 'foo') self.assertEqual(nth_url_segment('foo//?bar#frag'), 'foo') self.assertEqual(nth_url_segment('/path/to/foo#frag'), 'foo') self.assertEqual(nth_url_segment('/path/to/foo#frag', n=-2), 'to') self.assertEqual(nth_url_segment('https://server/foo?xyz=zyx'), 'foo')
def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None): """ Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given). Arguments: mets_url (string): Source METS URL or filesystem path Keyword Arguments: dst_dir (string, None): Target directory for the workspace. \ By default create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \ (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.) clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \ By default existing ``mets.xml`` will raise an exception. download (boolean, False): Whether to also download all the files referenced by the METS src_baseurl (string, None): Base URL for resolving relative file locations Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless the former is already local and the latter is ``none`` or already identical to its directory name. Returns: a new :py:class:`~ocrd.workspace.Workspace` """ log = getLogger('ocrd.resolver.workspace_from_url') if mets_url is None: raise ValueError("Must pass 'mets_url' workspace_from_url") # if mets_url is a relative filename, make it absolute if is_local_filename(mets_url) and not Path(mets_url).is_absolute(): mets_url = str(Path(Path.cwd() / mets_url)) # if mets_basename is not given, use the last URL segment of the mets_url if mets_basename is None: mets_basename = nth_url_segment(mets_url, -1) # If src_baseurl wasn't given, determine from mets_url by removing last url segment if not src_baseurl: last_segment = nth_url_segment(mets_url) src_baseurl = remove_non_path_from_url( remove_non_path_from_url(mets_url)[:-len(last_segment)]) # resolve dst_dir if not dst_dir: if is_local_filename(mets_url): log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url) dst_dir = Path(mets_url).parent else: log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url) dst_dir = mkdtemp(prefix=TMP_PREFIX) # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently if not Path(dst_dir).exists(): Path(dst_dir).mkdir(parents=True, exist_ok=False) dst_dir = str(Path(dst_dir).resolve()) log.debug( "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", mets_basename, mets_url, src_baseurl, dst_dir) self.download_to_directory( dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip') workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl) if download: for f in workspace.mets.find_files(): workspace.download_file(f) return workspace