コード例 #1
0
ファイル: test_workspace.py プロジェクト: hefv57/core
    def test_superfluous_copies_in_ws_dir(self):
        """
        Regression test for https://github.com/OCR-D/core/issues/227

        Seeds a temporary workspace directory with a single-file METS,
        downloads every file the METS references and checks that the
        directory then holds exactly two files, one of them the image
        below ``OCR-D-IMG``.
        """
        def find_recursive(root):
            # Names (not paths) of all files anywhere below ``root``
            return [name for _, _, names in walk(root) for name in names]

        with TemporaryDirectory() as wsdir:
            mets_source = assets.path_to(
                'SBB0000F29300010000/data/mets_one_file.xml')
            # Copy the fixture METS into the otherwise empty workspace dir
            with open(mets_source, 'r') as f_in, \
                    open(join(wsdir, 'mets.xml'), 'w') as f_out:
                f_out.write(f_in.read())

            # Only the METS is present before anything is downloaded
            self.assertEqual(len(find_recursive(wsdir)), 1)

            workspace = Workspace(self.resolver, wsdir)
            for ocrd_file in workspace.mets.find_files():
                workspace.download_file(ocrd_file)

            # METS plus exactly one downloaded file
            self.assertEqual(len(find_recursive(wsdir)), 2)
            expected_image = join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE.tif')
            self.assertTrue(exists(expected_image))
コード例 #2
0
ファイル: resolver.py プロジェクト: mjenckel/core
    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone it).

        Fetches the METS into ``dst_dir`` via ``download_to_directory`` and
        returns a ``Workspace`` rooted at that directory.

        Arguments:
            mets_url (string): Source mets URL. A relative local filename is made absolute against the current working directory.
            dst_dir (string, None): Target directory for the workspace. If not given, the parent directory of a local ``mets_url`` is used, otherwise a new temporary directory is created. A non-existing directory is created.
            clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
            mets_basename (string, None): File name of the METS inside ``dst_dir``. Defaults to the last URL segment of ``mets_url``.
            download (boolean, False): Whether to download all the files
            src_baseurl (string, None): Base URL for resolving relative file locations. Defaults to ``mets_url`` with its last segment removed.

        Returns:
            Workspace

        Raises:
            ValueError: If ``mets_url`` is None.
        """

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                mets_parent = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", mets_parent, mets_url)
                dst_dir = mets_parent
            else:
                # Create the directory first so that the log message shows the
                # real path instead of the still-empty dst_dir
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
コード例 #3
0
    def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_basename=None, download=False, download_local=False):
        """
        Create a workspace from a METS by URL.

        Sets the mets.xml file

        Arguments:
            mets_url (string, None): Source METS URL or filename. A value without ``://`` is turned into an absolute ``file://`` URL. If None, it is built from ``directory`` and ``mets_basename``.
            directory (string, None): Workspace directory. If not given, the directory of a ``file://`` METS is used, otherwise a new temporary directory is created.
            clobber_mets (boolean, False): Whether an already existing target METS may be replaced. If false, an existing target raises an exception.
            mets_basename (string, None): File name of the METS inside ``directory``. Defaults to the last URL segment of ``mets_url`` (query and fragment stripped), or ``mets.xml`` when ``mets_url`` is None as well.
            download (boolean, False): Download the files of every file group.
            download_local (boolean, False): Only fetch the files found with ``local_only=True``, each into a sub-directory named after its file group. Takes precedence over ``download``.

        Returns:
            Workspace

        Raises:
            Exception: If neither ``mets_url`` nor ``directory`` is given, or if the target METS exists and ``clobber_mets`` is false.
        """
        if directory is not None and not directory.startswith('/'):
            directory = os.path.abspath(directory)

        if mets_url is None:
            if directory is None:
                raise Exception("Must pass mets_url and/or directory to workspace_from_url")
            # Without this fallback a missing basename would be formatted into
            # the URL as the literal string 'None'
            if mets_basename is None:
                mets_basename = 'mets.xml'
            mets_url = 'file://%s/%s' % (directory, mets_basename)
        if '://' not in mets_url:
            # resolve to absolute
            mets_url = 'file://' + os.path.abspath(mets_url)
        if directory is None:
            if mets_url.startswith('file://'):
                # if directory was not given and mets_url is a file assume that
                # directory should be the directory where the mets.xml resides
                directory = os.path.dirname(mets_url[len('file://'):])
            else:
                directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating workspace '%s' for METS @ <%s>", directory, mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = mets_url \
                .rsplit('/', 1)[-1] \
                .split('?')[0] \
                .split('#')[0]

        mets_fpath = os.path.join(directory, mets_basename)
        log.debug("Copying mets url '%s' to '%s'", mets_url, mets_fpath)
        if 'file://' + mets_fpath == mets_url:
            # Source already is the workspace METS, nothing to copy
            log.debug("Target and source mets are identical")
        elif os.path.exists(mets_fpath) and not clobber_mets:
            raise Exception("File '%s' already exists but clobber_mets is false" % mets_fpath)
        else:
            self.download_to_directory(directory, mets_url, basename=mets_basename)

        workspace = Workspace(self, directory, mets_basename=mets_basename)

        if download_local or download:
            for file_grp in workspace.mets.file_groups:
                if download_local:
                    for f in workspace.mets.find_files(fileGrp=file_grp, local_only=True):
                        workspace.download_file(f, subdir=file_grp)
                else:
                    workspace.download_files_in_group(file_grp)

        return workspace
コード例 #4
0
def test_superfluous_copies_in_ws_dir(tmp_path):
    """
    Regression test for https://github.com/OCR-D/core/issues/227

    Copies a single-file METS into ``tmp_path``, downloads every file the
    METS references and checks that the directory then holds exactly two
    files, one of them the image below ``OCR-D-IMG``.
    """
    # arrange: workspace directory seeded with the fixture METS
    mets_source = assets.path_to('SBB0000F29300010000/data/mets_one_file.xml')
    copyfile(mets_source, join(tmp_path, 'mets.xml'))
    workspace = Workspace(Resolver(), tmp_path)

    # precondition: nothing but the METS is present
    assert count_files(tmp_path) == 1

    # act
    for ocrd_file in workspace.mets.find_all_files():
        workspace.download_file(ocrd_file)

    # assert: METS plus exactly one downloaded file
    assert count_files(tmp_path) == 2
    expected_image = join(tmp_path, 'OCR-D-IMG/FILE_0005_IMAGE.tif')
    assert exists(expected_image)
コード例 #5
0
    def test_227_1(self):
        """
        Seed a temporary workspace directory with a single-file METS,
        download every file the METS references and check that the directory
        then holds exactly two files, one of them ``FILE_0005_IMAGE`` below
        ``OCR-D-IMG``.
        """
        def find_recursive(root):
            # Names (not paths) of all files anywhere below ``root``
            return [name for _, _, names in os.walk(root) for name in names]

        with TemporaryDirectory() as wsdir:
            mets_source = assets.path_to(
                'SBB0000F29300010000/data/mets_one_file.xml')
            # Copy the fixture METS into the otherwise empty workspace dir
            with open(mets_source, 'r') as f_in, \
                    open(join(wsdir, 'mets.xml'), 'w') as f_out:
                f_out.write(f_in.read())

            # Only the METS is present before anything is downloaded
            self.assertEqual(len(find_recursive(wsdir)), 1)

            workspace = Workspace(self.resolver, wsdir)
            for ocrd_file in workspace.mets.find_files():
                workspace.download_file(ocrd_file)

            # METS plus exactly one downloaded file
            self.assertEqual(len(find_recursive(wsdir)), 2)
            expected_image = join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE')
            self.assertTrue(exists(expected_image))
コード例 #6
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path. A relative
                local filename is made absolute against the current working directory.
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace.
                If not given, the parent directory of a local :py:attr:`mets_url` is used;
                otherwise a temporary directory is created with prefix :py:data:`ocrd.constants.TMP_PREFIX`.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
                A non-existing directory is created.
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``.
                Passed on as ``if_exists='overwrite'``; by default ``if_exists='skip'`` is used.
            mets_basename (string, None): File name of the METS inside :py:attr:`dst_dir`.
                Defaults to the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations.
                Defaults to :py:attr:`mets_url` with its last segment removed.

        Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``None`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: If :py:attr:`mets_url` is None.
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(
                remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                mets_parent = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", mets_parent, mets_url)
                dst_dir = mets_parent
            else:
                # Create the directory first so that the log message shows the
                # real path instead of the still-empty dst_dir
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>",
                          dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug(
            "workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(
            dst_dir,
            mets_url,
            basename=mets_basename,
            if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
コード例 #7
0
    def workspace_from_url(self,
                           mets_url,
                           dst_dir=None,
                           clobber_mets=False,
                           mets_basename=None,
                           download=False,
                           baseurl=None):
        """
        Build a workspace around a METS document identified by URL.

        The METS is fetched into ``dst_dir`` unless source and target are the
        same file, then a ``Workspace`` for that directory is returned.

        Arguments:
            mets_url (string, None): Source METS URL or filename. A value without ``://`` is turned into an absolute ``file://`` URL. If None, it is built from ``baseurl`` and ``mets_basename`` (or ``mets.xml``).
            dst_dir (string, None): Target directory for the workspace. If None, the directory of a ``file://`` METS is used, otherwise a new temporary directory is created.
            clobber_mets (boolean, False): Whether an already existing target METS may be replaced. If false, an existing target raises an exception.
            mets_basename (string, None): File name of the METS inside ``dst_dir``. Defaults to the last URL segment of ``mets_url`` with query and fragment stripped.
            download (boolean, False): Whether to download all the files
            baseurl (string, None): Base URL for resolving relative file locations. Defaults to ``mets_url`` up to its last ``/``.

        Returns:
            Workspace

        Raises:
            Exception: If neither ``mets_url`` nor ``baseurl`` is given, or if the target METS exists and ``clobber_mets`` is false.
        """
        file_scheme = 'file://'

        # A relative target directory is anchored at the current working directory
        if dst_dir and not dst_dir.startswith('/'):
            dst_dir = abspath(dst_dir)

        if mets_url is None:
            if baseurl is None:
                raise Exception(
                    "Must pass mets_url and/or baseurl to workspace_from_url")
            fallback_name = mets_basename or 'mets.xml'
            mets_url = 'file://%s/%s' % (baseurl, fallback_name)

        # NOTE(review): baseurl is derived before mets_url is made absolute,
        # so a bare relative filename yields a relative baseurl - confirm
        # that this is intended before reordering anything here.
        if baseurl is None:
            baseurl = mets_url.rsplit('/', 1)[0]
        log.debug(
            "workspace_from_url\nmets_url='%s'\nbaseurl='%s'\ndst_dir='%s'",
            mets_url, baseurl, dst_dir)

        # plain filenames become absolute file URLs
        if mets_url.find('://') == -1:
            mets_url = file_scheme + abspath(mets_url)

        if dst_dir is None:
            if mets_url.startswith(file_scheme):
                # a local METS without explicit dst_dir: work in the directory
                # the METS resides in
                dst_dir = dirname(mets_url[len(file_scheme):])
            else:
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating workspace '%s' for METS @ <%s>", dst_dir,
                          mets_url)

        # default basename: last URL segment without query string and fragment
        if mets_basename is None:
            last_segment = mets_url.rsplit('/', 1)[-1]
            mets_basename = last_segment.partition('?')[0].partition('#')[0]

        target_mets = join(dst_dir, mets_basename)
        log.debug("Copying mets url '%s' to '%s'", mets_url, target_mets)
        if file_scheme + target_mets == mets_url:
            log.debug("Target and source mets are identical")
        elif exists(target_mets) and not clobber_mets:
            raise Exception(
                "File '%s' already exists but clobber_mets is false" %
                target_mets)
        else:
            self.download_to_directory(dst_dir,
                                       mets_url,
                                       basename=mets_basename)

        workspace = Workspace(self,
                              dst_dir,
                              mets_basename=mets_basename,
                              baseurl=baseurl)

        if download:
            for ocrd_file in workspace.mets.find_files():
                workspace.download_file(ocrd_file)

        return workspace