Beispiel #1
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, force,
                       local_filename):
    """
    Register LOCAL_FILENAME as a file in the METS of a workspace.

    A path that does not begin with the workspace directory is first handed
    to the resolver's ``download_to_directory`` (targeting the ``file_grp``
    subdirectory); the path it returns is the one recorded. The METS is
    saved afterwards.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=ctx.mets_basename,
                          automatic_backup=ctx.automatic_backup)

    inside_workspace = local_filename.startswith(ctx.directory)
    if not inside_workspace:
        log.debug("File '%s' is not in workspace, copying", local_filename)
        source_url = "file://" + local_filename
        local_filename = ctx.resolver.download_to_directory(
            ctx.directory, source_url, subdir=file_grp)

    # The METS entry carries both the file:// URL and the plain local path.
    file_attributes = dict(
        fileGrp=file_grp,
        ID=file_id,
        mimetype=mimetype,
        url="file://" + local_filename,
        pageId=page_id,
        force=force,
        local_filename=local_filename,
    )
    workspace.mets.add_file(**file_attributes)
    workspace.save_mets()
Beispiel #2
0
def set_id(ctx, id):  # pylint: disable=redefined-builtin
    """
    Store ``id`` as the unique identifier of the workspace METS and save it.
    """
    resolver = ctx.resolver
    workspace_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'automatic_backup': ctx.automatic_backup,
    }
    workspace = Workspace(resolver, **workspace_options)
    workspace.mets.unique_identifier = id
    workspace.save_mets()
Beispiel #3
0
def do_the_update(bagdir, non_local_urls=False):
    """
    Give the files referenced by an OCRD-ZIP's METS the extension matching
    their mimetype.

    For every file found in ``<bagdir>/data/mets.xml`` the expected extension
    is looked up in ``MIME_TO_EXT``. Files whose URL already ends in that
    extension are left alone. Otherwise the URL in the METS is rewritten and,
    if the file exists on disk, it is renamed to the same new path: an
    extension listed in ``EXT_TO_MIME`` is replaced, anything else gets the
    new extension appended. Finally the METS is saved and
    ``update_checksums`` is run on the bag.

    Args:
        bagdir: Directory of the bag; must contain ``data/mets.xml``,
            otherwise an error is logged and nothing is changed.
        non_local_urls: When true, URLs of files that do not exist locally
            are rewritten as well (nothing is renamed for those).
    """
    directory = Path(bagdir, 'data')
    if not Path(directory, 'mets.xml').exists():
        LOG.error("Something's wrong with OCRD-ZIP at %s, no data/mets.xml!",
                  bagdir)
        return
    workspace = Workspace(resolver, directory=str(directory))
    with pushd_popd(directory):
        for f in workspace.mets.find_files():
            fp = Path(f.url)
            if not fp.exists() and not non_local_urls:
                LOG.debug("Skipping non-local file: %s", fp)
                continue
            ext = MIME_TO_EXT.get(f.mimetype)
            if not ext:
                LOG.error(
                    "No rule to translate '%s' to an extension. Skipping %s",
                    f.mimetype, fp)
                continue
            if fp.suffix == ext:
                LOG.debug("Already has the right extension, %s", fp.name)
                continue
            # Path the new extension gets appended to when renaming on disk.
            rename_base = fp
            # fp.suffix differs from ext here (the equal case continued above).
            if fp.suffix and fp.suffix in EXT_TO_MIME:
                LOG.warning("Has the WRONG extension, is '%s' should be '%s'",
                            fp.suffix, ext)
                f.url = f.url[:-len(fp.suffix)]
                # Drop the wrong suffix from the on-disk name too, so the
                # renamed file matches the URL written to the METS.
                rename_base = fp.with_suffix('')
            LOG.info('Renaming %s{,%s}', fp, ext)
            f.url = "%s%s" % (f.url, ext)
            if fp.exists():
                fp.rename('%s%s' % (rename_base, ext))
        workspace.save_mets()
        LOG.debug('Running bagit update script')
        # NOTE(review): this runs while still inside pushd_popd(directory);
        # if that changes the working directory, a relative bagdir would be
        # resolved against the data directory -- confirm.
        update_checksums(bagdir)
    LOG.info("FINISHED: %s", bagdir)
Beispiel #4
0
def set_id(ctx, id):  # pylint: disable=redefined-builtin
    """
    Write ``id`` into the METS as its unique identifier, then save the METS.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    setattr(workspace.mets, 'unique_identifier', id)
    workspace.save_mets()
Beispiel #5
0
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)

    Only files matching the given ID, fileGrp, mimetype and pageId filters
    are considered. A file is removed from the METS when it has no local
    filename or when that filename does not exist, checked from inside the
    workspace directory. The METS is saved once all files were examined;
    any error is logged and re-raised before saving.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files(
                ID=file_id,
                fileGrp=file_grp,
                mimetype=mimetype,
                pageId=page_id,
        ):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s' rather than '%f': the file object is not a float, so
                # the float conversion would break formatting of the message.
                ctx.log.exception("Error removing %s: %s", f, e)
                # Bare raise keeps the original traceback.
                raise
        workspace.save_mets()
Beispiel #6
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore,
                       check_file_exists, force, fname):
    """
    Register FNAME, a local file or an http(s) URL, in the workspace METS.

    Without an explicit mimetype, one is looked up in ``EXT_TO_MIME`` by the
    extension of FNAME. An http(s) URL is recorded as is. Any other FNAME
    that does not start with the workspace directory is either resolved
    against that directory (if such a file exists) or passed to the resolver
    to be copied into the workspace. A path that starts with the workspace
    directory is recorded relative to it.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=ctx.mets_basename,
                          automatic_backup=ctx.automatic_backup)

    log = getLogger('ocrd.cli.workspace.add')
    if not mimetype:
        extension = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[extension]
        except KeyError:
            # Execution continues without a mimetype after reporting this.
            log.error(
                "Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly"
                % (extension, fname))
        else:
            log.info("Guessed mimetype to be %s" % mimetype)

    kwargs = dict(
        fileGrp=file_grp,
        ID=file_id,
        mimetype=mimetype,
        pageId=page_id,
        force=force,
        ignore=ignore,
    )
    log.debug("Adding '%s' (%s)", fname, kwargs)

    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        workspace_dir = ctx.directory
        if not fname.startswith(workspace_dir):
            if isabs(fname) or not exists(join(workspace_dir, fname)):
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(
                        workspace_dir, fname, subdir=file_grp)
                except FileNotFoundError:
                    # Only fatal when an existence check was requested.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" %
                                  fname)
                        sys.exit(1)
            else:
                fname = join(workspace_dir, fname)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        if fname.startswith(workspace_dir):
            fname = relpath(fname, workspace_dir)
        kwargs['local_filename'] = fname

    kwargs['url'] = fname
    if not page_id:
        log.warning(
            "You did not provide '--page-id/-g', so the file you added is not linked to a specific page."
        )
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
Beispiel #7
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, group_id,
                       local_filename):
    """
    Add LOCAL_FILENAME to the METS of the workspace and save the METS.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory)
    file_attributes = dict(
        file_grp=file_grp,
        file_id=file_id,
        mimetype=mimetype,
        group_id=group_id,
        local_filename=local_filename,
    )
    workspace.mets.add_file(**file_attributes)
    workspace.save_mets()
Beispiel #8
0
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set the ID of the METS.

    When the METS already uses one of the supported identifier mechanisms,
    that identifier is set.

    Otherwise a new <mods:identifier type="purl">{{ ID }}</mods:identifier> is created.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    mets = workspace.mets
    mets.unique_identifier = id
    workspace.save_mets()
Beispiel #9
0
def prune_files(ctx):
    """
    Remove METS files that point to non-existing local files.

    Every file in the METS is examined from inside the workspace directory;
    it is removed when it has no local filename or when that filename does
    not exist. The METS is saved once all files were examined; any error is
    logged and re-raised before saving.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=ctx.mets_basename)
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files():
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s' rather than '%f': the file object is not a float, so
                # the float conversion would break formatting of the message.
                log.exception("Error removing %s: %s", f, e)
                # Bare raise keeps the original traceback.
                raise
        workspace.save_mets()
 def test_crop(self):
     """Running the cropper on BIN must add exactly one PAGE file to the METS."""
     def count_page_files(workspace):
         # Look up workspace.mets on every call; it is reloaded in between.
         return len(workspace.mets.find_files(mimetype=MIMETYPE_PAGE))

     with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
         ws = Workspace(self.resolver, wsdir)
         expected_count = count_page_files(ws) + 1
         processor_args = dict(
             resolver=self.resolver,
             mets_url=str(Path(wsdir, 'mets.xml')),
             input_file_grp='BIN',
             output_file_grp='CROP-TEST',
             parameter={},
         )
         run_processor(OcrdAnybaseocrCropper, **processor_args)
         ws.reload_mets()
         self.assertEqual(count_page_files(ws), expected_count)
 def test_crop(self):
     """Running the dewarper on BIN must add exactly one PAGE file to the METS.

     Skipped when CUDA is not available.
     """
     cuda_available = torch.cuda.is_available()
     if not cuda_available:
         pytest.skip('CUDA is not available, cannot test dewarping')

     def count_page_files(workspace):
         # Look up workspace.mets on every call; it is reloaded in between.
         return len(workspace.mets.find_files(mimetype=MIMETYPE_PAGE))

     with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
         ws = Workspace(self.resolver, wsdir)
         expected_count = count_page_files(ws) + 1
         processor_args = dict(
             resolver=self.resolver,
             mets_url=str(Path(wsdir, 'mets.xml')),
             input_file_grp='BIN',
             output_file_grp='DEWARP-TEST',
             parameter={'model_path': str(self.model_path)},
         )
         run_processor(OcrdAnybaseocrDewarper, **processor_args)
         ws.reload_mets()
         self.assertEqual(count_page_files(ws), expected_count)
Beispiel #12
0
 def test_copies_ok(self):
     """Run the dummy processor twice and check the files it adds to the METS."""
     def url_of(entry):
         return entry.url

     with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(Resolver(), wsdir)

         def find(**criteria):
             # Always go through workspace.mets rather than a cached reference.
             return workspace.mets.find_files(**criteria)

         # Before processing: three input images, nothing in OUTPUT yet.
         self.assertEqual(len(find(fileGrp='OCR-D-IMG')), 3)
         self.assertEqual(len(find(fileGrp='OUTPUT')), 0)

         first_run = dict(input_file_grp='OCR-D-IMG',
                          output_file_grp='OUTPUT',
                          workspace=workspace)
         run_processor(DummyProcessor, **first_run)

         produced = find(fileGrp='OUTPUT')
         produced.sort(key=url_of)
         print([str(s) for s in produced])
         self.assertEqual(produced[0].url, 'OUTPUT/OUTPUT_0001.tif')
         self.assertEqual(produced[1].url, 'OUTPUT/OUTPUT_0001.xml')
         self.assertEqual(page_from_file(produced[1]).pcGtsId, produced[1].ID)
         self.assertEqual(page_from_file(produced[1]).get_Page().imageFilename, produced[0].url)
         self.assertEqual(len(produced), 6)
         self.assertEqual(len(find(ID='//OUTPUT.*')), 6)
         self.assertEqual(len(find(ID='//OUTPUT.*_PAGE')), 3)
         self.assertEqual(len(find(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)

         second_run = dict(input_file_grp='OUTPUT',
                           output_file_grp='OUTPUT2',
                           workspace=workspace)
         run_processor(DummyProcessor, **second_run)

         produced_again = find(fileGrp='OUTPUT2')
         produced_again.sort(key=url_of)
         self.assertEqual(len(produced_again), 3)
Beispiel #13
0
def workspace_backup_list(ctx):
    """
    Print every backup reported by the backup manager, one per line.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    backup_manager = WorkspaceBackupManager(workspace)
    for backup in backup_manager.list():
        print(backup)
Beispiel #14
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore,
                       check_file_exists, force, fname):
    """
    Register FNAME, a local file or an http(s) URL, in the workspace METS.

    An http(s) URL is recorded as is. Any other FNAME that does not start
    with the workspace directory is either resolved against that directory
    (if such a file exists) or passed to the resolver to be copied into the
    workspace. A path that starts with the workspace directory is recorded
    relative to it.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)

    kwargs = dict(
        fileGrp=file_grp,
        ID=file_id,
        mimetype=mimetype,
        pageId=page_id,
        force=force,
        ignore=ignore,
    )
    log = getLogger('ocrd.cli.workspace.add')
    log.debug("Adding '%s' (%s)", fname, kwargs)

    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        workspace_dir = ctx.directory
        if not fname.startswith(workspace_dir):
            if isabs(fname) or not exists(join(workspace_dir, fname)):
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(
                        workspace_dir, fname, subdir=file_grp)
                except FileNotFoundError:
                    # Only fatal when an existence check was requested.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" %
                                  fname)
                        sys.exit(1)
            else:
                fname = join(workspace_dir, fname)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        if fname.startswith(workspace_dir):
            fname = relpath(fname, workspace_dir)
        kwargs['local_filename'] = fname

    kwargs['url'] = fname
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
Beispiel #15
0
def list_pages(ctx):
    """
    Print the physical page IDs of the METS, one per line.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
    )
    page_ids = workspace.mets.physical_pages
    listing = "\n".join(page_ids)
    print(listing)
Beispiel #16
0
def get_id(ctx):
    """
    Print the unique identifier of the METS; print nothing if it is empty.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
    )
    identifier = workspace.mets.unique_identifier
    if not identifier:
        return
    print(identifier)
Beispiel #17
0
def list_groups(ctx):
    """
    Print the file groups of the METS, one per line.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
    )
    group_names = workspace.mets.file_groups
    listing = "\n".join(group_names)
    print(listing)
Beispiel #18
0
def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id,
          mimetype, mets_path):  # pylint: disable=redefined-builtin
    """
    Merge the workspace containing ``METS_PATH`` into this workspace.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp``
    options work as they do for ``ocrd workspace find``; see
    ``ocrd workspace find --help`` for details.
    """
    mets_path = Path(mets_path)
    # A non-empty mapping arrives as a string and is decoded with loads().
    filegrp_mapping = loads(filegrp_mapping) if filegrp_mapping else filegrp_mapping

    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=ctx.mets_basename,
                          automatic_backup=ctx.automatic_backup)
    other_directory = str(mets_path.parent)
    other_basename = str(mets_path.name)
    other_workspace = Workspace(ctx.resolver,
                                directory=other_directory,
                                mets_basename=other_basename)

    merge_options = dict(
        copy_files=copy_files,
        fileGrp_mapping=filegrp_mapping,
        fileGrp=file_grp,
        ID=file_id,
        pageId=page_id,
        mimetype=mimetype,
    )
    workspace.merge(other_workspace, **merge_options)
    workspace.save_mets()
Beispiel #19
0
 def setUp(self):
     """Copy the kant_aufklaerung_1784 asset into a fresh temporary directory
     and create a workspace and backup manager on the copy."""
     self.resolver = Resolver()
     self.tempdir = mkdtemp()
     self.workspace_dir = join(self.tempdir, 'kant_aufklaerung_1784')
     asset_dir = assets.path_to('kant_aufklaerung_1784/data')
     copytree(asset_dir, self.workspace_dir)
     workspace = Workspace(self.resolver, directory=self.workspace_dir)
     self.workspace = workspace
     self.mgr = WorkspaceBackupManager(workspace)
Beispiel #20
0
def workspace_backup_undo(ctx):
    """
    Undo via the backup manager, restoring the last backup.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    WorkspaceBackupManager(workspace).undo()
Beispiel #21
0
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore the backup named BAK via the backup manager.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    WorkspaceBackupManager(workspace).restore(bak, choose_first)
Beispiel #22
0
def workspace_backup_add(ctx):
    """
    Add a new backup via the backup manager.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    WorkspaceBackupManager(workspace).add()
Beispiel #23
0
def validate_process(tasks, workspace):
    '''
    Validate a sequence of tasks passable to 'ocrd process'

    With a workspace, the parsed tasks are validated together against it;
    without one, each parsed task is validated on its own.
    '''
    parsed_tasks = [ProcessorTask.parse(t) for t in tasks]
    if not workspace:
        for task in parsed_tasks:
            _inform_of_result(task.validate())
        return
    target = Workspace(Resolver(), directory=workspace)
    _inform_of_result(validate_tasks(parsed_tasks, target))
Beispiel #24
0
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
    # The existing name is passed first, its replacement second.
    workspace.rename_file_group(old, new)
    workspace.save_mets()
Beispiel #25
0
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files and print the requested fields, tab-separated, one file per line.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
    rows = []
    mets_changed = False
    matches = workspace.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        mimetype=mimetype,
        pageId=page_id,
    )
    for f in matches:
        if download and not f.local_filename:
            workspace.download_file(f)
            mets_changed = True
        row = []
        for field in output_field:
            if field == 'pageId':
                # Placeholder: the file ID, swapped for the page further down.
                row.append(f.ID)
            else:
                row.append(getattr(f, field) or '')
        rows.append(row)
    if mets_changed:
        workspace.save_mets()
    if 'pageId' in output_field:
        column = output_field.index('pageId')
        file_ids = [row[column] for row in rows]
        pages = workspace.mets.get_physical_pages(for_fileIds=file_ids)
        for row, page in zip(rows, pages):
            row[column] = page or ''
    for row in rows:
        print('\t'.join(row))
Beispiel #26
0
def remove_group(ctx, group, recursive, force):
    """
    Remove each file group named in ``group`` from the workspace, then save
    the METS.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    for group_name in group:
        workspace.remove_file_group(group_name, recursive, force)
    workspace.save_mets()
Beispiel #27
0
def workspace_remove_file(ctx, id, force):  # pylint: disable=redefined-builtin
    """
    Remove every file whose ID is listed in ``id`` from the workspace, then
    save the METS.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    for file_id in id:
        workspace.remove_file(file_id, force=force)
    workspace.save_mets()
Beispiel #28
0
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Remove fileGrps, each identified by its USE attribute ``GROUP``.

    (A ``GROUP`` that starts with ``//`` has its remainder
     interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
    )
    removal_options = dict(recursive=recursive, force=force, keep_files=keep_files)
    for group_name in group:
        workspace.remove_file_group(group_name, **removal_options)
    workspace.save_mets()
Beispiel #29
0
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Remove files, each identified by its ID attribute ``ID``.

    (An ``ID`` that starts with ``//`` has its remainder
     interpreted as a regular expression.)
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    removal_options = dict(force=force, keep_file=keep_file)
    for file_id in id:
        workspace.remove_file(file_id, **removal_options)
    workspace.save_mets()
Beispiel #30
0
def workspace_find(ctx, file_grp, local_only, mimetype, group_id, file_id,
                   output_field, download):
    """
    Find files and print the requested fields, tab-separated, one file per line.

    With ``download`` set, each match is downloaded into the subdirectory
    named after its fileGrp and the METS is saved right after.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory)
    search_filters = dict(
        ID=file_id,
        fileGrp=file_grp,
        local_only=local_only,
        mimetype=mimetype,
        groupId=group_id,
    )
    for f in workspace.mets.find_files(**search_filters):
        if download:
            workspace.download_file(f, subdir=f.fileGrp)
            workspace.save_mets()
        values = [getattr(f, field) or '' for field in output_field]
        print('\t'.join(values))