Example #1
0
 def test_overwrite(self):
     """Check that an already-present output fileGrp is rejected unless ``overwrite`` is set."""
     # The same four-step sequence is validated twice below.
     task_lines = (
         "sample-processor -I OCR-D-IMG       -O OCR-D-SEG-BLOCK",
         "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE",
         "sample-processor -I OCR-D-SEG-LINE  -O OCR-D-SEG-WORD",
         "sample-processor -I OCR-D-SEG-WORD  -O OCR-D-OCR-TESS",
     )
     expected_error = r"Invalid task sequence input/output file groups: \[\"Output fileGrp\[@USE='OCR-D-SEG-WORD'\] already in METS!\"\]"
     resolver = Resolver()
     with TemporaryDirectory() as tempdir:
         mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
         workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
         # Register a file in OCR-D-SEG-WORD up front so that step 3 collides with it.
         workspace.mets.add_file(
             'OCR-D-SEG-WORD',
             url='foo/bar',
             ID='foo',
             pageId='page1',
             mimetype='image/tif',
         )
         # should fail at step 3
         with self.assertRaisesRegex(Exception, expected_error):
             validate_tasks(
                 [ProcessorTask.parse(line) for line in task_lines],
                 workspace)
         # should succeed because overwriting is requested
         validate_tasks(
             [ProcessorTask.parse(line) for line in task_lines],
             workspace,
             overwrite=True)
 def test_validate_ocrd_file(self):
     """Expect 17 ``ConsistencyError`` entries when validating FAULTY_GLYPHS_FILE."""
     resolver = Resolver()
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url)
     with pushd_popd(workspace.directory):
         faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_file)
         # Only consistency errors are counted; other error types are ignored.
         consistency_errors = [
             err for err in report.errors if isinstance(err, ConsistencyError)
         ]
         self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
Example #3
0
    def test_validate_sequence(self):
        """Check that task validation rejects input fileGrps that are not available."""
        resolver = Resolver()
        with TemporaryDirectory() as tempdir:
            mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
            workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
            params_path = Path(tempdir, 'params.json')
            params_path.write_text('{"param1": true}')

            # Second step reads FOO, which the first step does not produce.
            broken_chain = [
                '%s -I OCR-D-IMG -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path),
                '%s -I FOO -O OUT2 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path),
            ]
            with self.assertRaisesRegex(
                    Exception,
                    "Input file group not contained in METS or produced by previous steps: FOO'"):
                validate_tasks(
                    [ProcessorTask.parse(cmd) for cmd in broken_chain],
                    workspace)

            # Single step whose input fileGrp IN is expected to be reported as not in METS.
            unknown_input = [
                '%s -I IN -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path),
            ]
            with self.assertRaisesRegex(
                    Exception, "Input fileGrp.@USE='IN'. not in METS!"):
                validate_tasks(
                    [ProcessorTask.parse(cmd) for cmd in unknown_input],
                    workspace)
Example #4
0
def bashlib_input_files(**kwargs):
    """
    Print the input files selected by the given processing options.

    A workspace is resolved from the ``mets`` and ``working_dir`` options and
    a ``Processor`` is created for the ``page_id``, ``input_file_grp`` and
    ``output_file_grp`` options. For every input file of that processor, one
    line is printed containing its `url`, `ID`, `mimetype` and `pageId`, plus
    the `outputFileId` computed by ``make_file_id``.

    (Each line is formatted as a bash associative array initializer.)

    Raises:
        Exception: if ``mets`` is a local filename that does not exist.
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    mets_is_missing = is_local_filename(mets) and not isfile(get_local_filename(mets))
    if mets_is_missing:
        raise Exception("File does not exist: %s" % mets)
    workspace = Resolver().workspace_from_url(mets, working_dir)
    processor = Processor(
        workspace,
        ocrd_tool=None,
        page_id=kwargs['page_id'],
        input_file_grp=kwargs['input_file_grp'],
        output_file_grp=kwargs['output_file_grp'],
    )
    attribute_names = ('url', 'ID', 'mimetype', 'pageId')
    for input_file in processor.input_files:
        # bash-friendly: "[key]='value'" pairs separated by single spaces
        initializers = ' '.join(
            "[%s]='%s'" % (name, getattr(input_file, name))
            for name in attribute_names)
        print(initializers, end=' ')
        output_file_id = make_file_id(input_file, kwargs['output_file_grp'])
        print("[outputFileId]='%s'" % output_file_id)
Example #5
0
class TestResolver(TestCase):
    """Tests that create workspaces through ``Resolver``."""

    def setUp(self):
        """Create a resolver and copy FOLDER_KANT into a fresh spot below TMP_FOLDER."""
        self.resolver = Resolver()
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        # An existing TMP_FOLDER is wiped and recreated empty before copying.
        if os.path.exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
            os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url(self):
        """The first OCR-D-IMG file of METS_HEROLD downloads as FILE_0001_IMAGE."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        image_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        first_image = image_files[0]
        downloaded = workspace.download_file(first_image)
        self.assertEqual(downloaded.ID, 'FILE_0001_IMAGE')

    def test_unpack_workspace(self):
        """TEST_ZIP unpacks to a workspace with two TIFF files that can be downloaded."""
        workspace = self.resolver.unpack_workspace_from_filename(TEST_ZIP)
        tiff_files = workspace.mets.find_files(mimetype='image/tiff')
        self.assertEqual(len(tiff_files), 2, '2 TIF')
        for tiff in tiff_files:
            workspace.download_file(tiff)
        exif_as_xml = [
            OcrdExif.from_filename(tiff.local_filename).to_xml()
            for tiff in tiff_files
        ]
        print(exif_as_xml)
Example #6
0
def ocrd_cli_wrap_processor(processorClass,
                            ocrd_tool=None,
                            mets=None,
                            working_dir=None,
                            dump_json=False,
                            version=False,
                            **kwargs):
    """
    Dispatch a processor CLI invocation.

    - ``dump_json``: instantiate ``processorClass`` with ``dump_json=True`` and return.
    - ``version``: instantiate ``processorClass`` and print its version next to
      ``OCRD_VERSION``, then return.
    - otherwise: resolve a workspace from ``mets`` (a value without ``://`` is
      turned into an absolute ``file://`` URL) and hand over to ``run_processor``
      with the remaining keyword arguments.

    Raises:
        Exception: if ``mets`` is not given, or if it is a ``file://`` URL whose
            path does not exist.
    """
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return
    if version:
        instance = processorClass(workspace=None)
        print("Version %s, ocrd/core %s" % (instance.version, OCRD_VERSION))
        return
    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')

    scheme = 'file://'
    if '://' not in mets:
        # bare path: make it an absolute file URL
        mets = scheme + os.path.abspath(mets)
    if mets.startswith(scheme):
        local_path = mets[len(scheme):]
        if not os.path.exists(local_path):
            raise Exception("File does not exist: %s" % mets)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    run_processor(processorClass,
                  ocrd_tool,
                  mets,
                  workspace=workspace,
                  **kwargs)
Example #7
0
def test_workspace_from_url_kant(mock_request, tmp_path):
    """Resolving the remote kant METS stores it as ``foo.xml`` with a single request."""

    # arrange
    mock_request.side_effect = request_behavior
    mets_url = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    target_dir = tmp_path / 'workspace_kant'
    target_dir.mkdir()

    # act
    Resolver().workspace_from_url(mets_url,
                                  mets_basename='foo.xml',
                                  dst_dir=target_dir)

    # assert: the METS ended up under the requested basename
    stored_mets = target_dir / 'foo.xml'
    assert os.path.isfile(str(stored_mets))
    # data was requested exactly once
    assert mock_request.call_count == 1
Example #8
0
 def test_validate_ocrd_file(self):
     """Expect 17 validation errors for FAULTY_GLYPHS_FILE, downloading it first if needed."""
     resolver = Resolver()
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url)
     faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
     already_local = bool(faulty_file.local_filename)
     if not already_local:
         workspace.download_file(faulty_file)
     report = PageValidator.validate(ocrd_file=faulty_file)
     error_count = len(report.errors)
     self.assertEqual(error_count, 17, 'errors')
Example #9
0
def _fixture_workspace_sample_features(tmp_path):
    """
    Yield a workspace for a copy of ``tests/data/sample-features``.

    The data is copied into ``tmp_path``; the working directory is switched to
    ``tmp_path`` while the generator is suspended and switched back afterwards.
    """
    copytree('tests/data/sample-features', str(tmp_path))
    resolver = Resolver()
    mets_path = join(str(tmp_path), 'mets.xml')
    sample_workspace = resolver.workspace_from_url(mets_path)
    # remember where we started so the cwd can be restored after the yield
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield sample_workspace
    chdir(original_cwd)
Example #10
0
def _fixture_workspace_gutachten_data(tmp_path):
    """
    Yield a workspace for a copy of the ``gutachten/data`` asset.

    The data is copied into ``tmp_path``; the working directory is switched to
    ``tmp_path`` while the generator is suspended and switched back afterwards.
    """
    source_dir = assets.path_to('gutachten/data')
    copytree(source_dir, str(tmp_path))
    resolver = Resolver()
    mets_path = join(str(tmp_path), 'mets.xml')
    gutachten_workspace = resolver.workspace_from_url(mets_path)
    # remember where we started so the cwd can be restored after the yield
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield gutachten_workspace
    chdir(original_cwd)
Example #11
0
 def runTest(self):
     """Run region segmentation from OCR-D-IMG into OCR-D-SEG-BLOCK and save the METS."""
     workspace = Resolver().workspace_from_url(METS_HEROLD_SMALL,
                                               dst_dir=WORKSPACE_DIR)
     region_segmenter = TesserocrSegmentRegion(
         workspace,
         input_file_grp="OCR-D-IMG",
         output_file_grp="OCR-D-SEG-BLOCK",
     )
     region_segmenter.process()
     workspace.save_mets()
Example #12
0
 def test_validate_ocrd_file(self):
     """Expect 17 validation errors for FAULTY_GLYPHS_FILE, validated from the workspace directory."""
     resolver = Resolver()
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url)
     with pushd_popd(workspace.directory):
         matches = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")
         report = PageValidator.validate(ocrd_file=matches[0])
         error_count = len(report.errors)
         self.assertEqual(error_count, 17, 'errors')
Example #13
0
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """
    Yield a workspace for a copy of the ``kant_aufklaerung_1784/data/`` asset.

    The data is copied into ``tmp_path``, which is also passed as
    ``src_baseurl``; the working directory is switched to ``tmp_path`` while
    the generator is suspended and switched back afterwards.
    """
    source_dir = assets.path_to('kant_aufklaerung_1784/data/')
    copytree(source_dir, str(tmp_path))
    resolver = Resolver()
    mets_path = join(tmp_path, 'mets.xml')
    kant_workspace = resolver.workspace_from_url(mets_path,
                                                 src_baseurl=tmp_path)
    # remember where we started so the cwd can be restored after the yield
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield kant_workspace
    chdir(original_cwd)
Example #14
0
 def test_run1(self):
     """Run ``KrakenSegment`` at line level on OCR-D-IMG-BIN and save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     segmenter_options = {'level-of-operation': 'line'}
     KrakenSegment(
         workspace,
         input_file_grp="OCR-D-IMG-BIN",
         output_file_grp="OCR-D-SEG-LINE-KRAKEN",
         parameter=segmenter_options,
     ).process()
     workspace.save_mets()
Example #15
0
 def test_param_json(self):
     """Run ``KrakenOcr`` through ``run_processor`` from INPUT into OCR-D-OCR-KRAKEN."""
     resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     run_processor(
         KrakenOcr,
         resolver=resolver,
         workspace=workspace,
         input_file_grp="INPUT",
         output_file_grp="OCR-D-OCR-KRAKEN",
     )
     workspace.save_mets()
 def runTest(self):
     """Run region then line segmentation, saving the METS once at the end."""
     resolver = Resolver(cache_enabled=True)
     workspace = resolver.workspace_from_url(METS_HEROLD_SMALL,
                                             directory=WORKSPACE_DIR)
     # (processor class, input fileGrp, output fileGrp), executed in order
     stages = (
         (TesserocrSegmentRegion, "INPUT", "OCR-D-SEG-BLOCK"),
         (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
     )
     for processor_class, source_grp, target_grp in stages:
         processor_class(workspace,
                         input_file_grp=source_grp,
                         output_file_grp=target_grp).process()
     # the METS is deliberately saved only after both stages have run
     workspace.save_mets()
Example #17
0
def run_tasks(mets, log_level, page_id, task_strs):
    """
    Parse ``task_strs`` into tasks and run them one after another on ``mets``.

    Before each task its input fileGrps must be present in the METS and its
    output fileGrps must be absent. After each task the METS is reloaded and
    the output fileGrps must be present.

    Raises:
        Exception: on a missing input fileGrp, an already existing output
            fileGrp, a non-zero return code, or a missing output fileGrp
            after the run.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    log = getLogger('ocrd.task_sequence')
    tasks = [ProcessorTask.parse(line) for line in task_strs]

    for task in tasks:

        task.validate()

        # precondition: every input group is already in the METS
        for grp in task.input_file_grps:
            if grp not in workspace.mets.file_groups:
                message = (
                    "Unmet requirement: expected input file group not contained in mets: %s"
                    % grp)
                raise Exception(message)

        # precondition: no output group exists yet
        for grp in task.output_file_grps:
            if grp in workspace.mets.file_groups:
                message = (
                    "Conflict: output file group already contained in mets: %s"
                    % grp)
                raise Exception(message)

        log.info("Start processing task '%s'", task)

        # run the processor executable
        returncode = run_cli(
            task.executable,
            mets,
            resolver,
            workspace,
            log_level=log_level,
            page_id=page_id,
            input_file_grp=','.join(task.input_file_grps),
            output_file_grp=','.join(task.output_file_grps),
            parameter=task.parameter_path,
        )

        if returncode != 0:
            message = "%s exited with non-zero return value %s" % (
                task.executable, returncode)
            raise Exception(message)

        log.info("Finished processing task '%s'", task)

        # pick up whatever the executable wrote to the METS
        workspace.reload_mets()

        # postcondition: every output group now exists
        for grp in task.output_file_grps:
            if grp not in workspace.mets.file_groups:
                message = (
                    "Invalid state: expected output file group not in mets: %s"
                    % grp)
                raise Exception(message)
Example #18
0
def test_workspace_from_url_kant_with_resources(mock_request, tmp_path):
    """Resolving the remote kant METS with ``download=True`` fetches its files too."""

    # arrange
    mock_request.side_effect = request_behavior
    mets_url = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    target_dir = tmp_path / 'workspace_kant'
    target_dir.mkdir()

    # act
    Resolver().workspace_from_url(mets_url,
                                  mets_basename='kant_aufklaerung_1784.xml',
                                  dst_dir=target_dir,
                                  download=True)

    # assert: METS, one image and one PAGE file are present under the target dir
    expected_files = (
        target_dir / 'kant_aufklaerung_1784.xml',
        target_dir / 'OCR-D-IMG' / 'INPUT_0017.tif',
        target_dir / 'OCR-D-GT-PAGE' / 'PAGE_0017_PAGE.xml',
    )
    for expected in expected_files:
        assert os.path.isfile(str(expected))

    # 1 METS/MODS + 2 images + 4 OCR files = 7 requests
    assert mock_request.call_count == 7
Example #19
0
 def test_run1(self):
     """Run ``OcropySegment`` on page P_0017 of OCR-D-IMG-BIN in a temporary workspace."""
     resolver = Resolver()
     with TemporaryDirectory() as tempdir:
         mets_path = assets.path_to(
             'kant_aufklaerung_1784-binarized/data/mets.xml')
         workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
         segmenter = OcropySegment(workspace,
                                   input_file_grp="OCR-D-IMG-BIN",
                                   output_file_grp="OCR-D-SEG-OCROPY-TEST",
                                   page_id='P_0017')
         segmenter.process()
         workspace.save_mets()
 def runTest(self):
     """Run region, line and word segmentation in sequence, then save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/mets.xml')
     workspace = resolver.workspace_from_url(mets_url,
                                             directory=WORKSPACE_DIR)
     # (processor class, input fileGrp, output fileGrp), executed in order
     stages = (
         (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
         (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
         (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
     )
     for processor_class, source_grp, target_grp in stages:
         processor_class(workspace,
                         input_file_grp=source_grp,
                         output_file_grp=target_grp).process()
     workspace.save_mets()
Example #21
0
class TestResolver(TestCase):
    """Basic checks on processors built on a workspace resolved from an asset METS."""

    def setUp(self):
        """Resolve a workspace from the SBB0000F29300010000 METS for each test."""
        self.resolver = Resolver()
        self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/mets.xml'))

    def test_verify(self):
        """``DummyProcessor.verify()`` returns True."""
        proc = DummyProcessor(self.workspace)
        # assertEqual, not the assertEquals alias (deprecated, removed in Python 3.12)
        self.assertEqual(proc.verify(), True)

    def test_json(self):
        """Constructing ``DummyProcessor`` with ``dump_json=True`` does not raise."""
        DummyProcessor(self.workspace, dump_json=True)

    def test_params(self):
        """A ``Processor`` built without parameters has an empty ``parameter`` dict."""
        proc = Processor(workspace=self.workspace)
        self.assertEqual(proc.parameter, {})
Example #22
0
 def test_422(self):
     """
     # OCR-D/core#422

     A four-step chain where each step consumes the previous step's output validates cleanly.
     """
     task_lines = (
         "sample-processor -I OCR-D-IMG       -O OCR-D-SEG-BLOCK",
         "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE",
         "sample-processor -I OCR-D-SEG-LINE  -O OCR-D-SEG-WORD",
         "sample-processor -I OCR-D-SEG-WORD  -O OCR-D-OCR-TESS",
     )
     resolver = Resolver()
     with TemporaryDirectory() as tempdir:
         mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
         workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
         parsed_tasks = [ProcessorTask.parse(line) for line in task_lines]
         validate_tasks(parsed_tasks, workspace)
Example #23
0
def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
    """
    Parse ``task_strs`` into tasks, validate the sequence and run each task on ``mets``.

    The whole sequence is validated with ``validate_tasks`` before anything is
    executed. After each task the METS is reloaded and the task's output
    fileGrps must be present.

    Raises:
        Exception: if a task exits with a non-zero return code or one of its
            output fileGrps is missing afterwards; the captured STDOUT and
            STDERR are included in the message.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    log = getLogger('ocrd.task_sequence.run_tasks')
    tasks = [ProcessorTask.parse(line) for line in task_strs]

    validate_tasks(tasks, workspace, page_id, overwrite)

    # Run the tasks
    for task in tasks:

        log.info("Start processing task '%s'", task)

        # run the processor executable, capturing its output
        cli_result = run_cli(
            task.executable,
            mets,
            resolver,
            workspace,
            log_level=log_level,
            page_id=page_id,
            overwrite=overwrite,
            input_file_grp=','.join(task.input_file_grps),
            output_file_grp=','.join(task.output_file_grps),
            parameter=json.dumps(task.parameters),
        )
        returncode, out, err = cli_result

        if returncode != 0:
            failure = (
                "%s exited with non-zero return value %s. STDOUT:\n%s\nSTDERR:\n%s"
                % (task.executable, returncode, out, err))
            raise Exception(failure)

        log.info("Finished processing task '%s'", task)

        # pick up whatever the executable wrote to the METS
        workspace.reload_mets()

        # postcondition: every output group now exists
        for grp in task.output_file_grps:
            if grp not in workspace.mets.file_groups:
                failure = (
                    "Invalid state: expected output file group not in mets: %s\nSTDOUT:\n%s\nSTDERR:\n%s"
                    % (grp, out, err))
                raise Exception(failure)
Example #24
0
def ocrd_cli_wrap_processor(processorClass,
                            ocrd_tool=None,
                            mets=None,
                            working_dir=None,
                            cache_enabled=True,
                            *args,
                            **kwargs):
    """
    Resolve a workspace from ``mets`` and run ``processorClass`` on it.

    A ``mets`` value without ``://`` is treated as a filesystem path and
    turned into an absolute ``file://`` URL. Extra positional and keyword
    arguments are forwarded to ``run_processor``.

    Raises:
        Exception: if ``mets`` is not given, or if it is a ``file://`` URL
            whose path does not exist.
    """
    # mets defaults to None; fail with a clear message instead of an
    # AttributeError from calling a str method on None.
    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')
    if '://' not in mets:
        # Make the path absolute so a relative path does not become the
        # malformed URL "file://relative/path".
        mets = 'file://' + os.path.abspath(mets)
    if mets.startswith('file://') and not os.path.exists(
            mets[len('file://'):]):
        raise Exception("File does not exist: %s" % mets)
    resolver = Resolver(cache_enabled=cache_enabled)
    workspace = resolver.workspace_from_url(mets, working_dir)
    run_processor(processorClass,
                  ocrd_tool,
                  mets,
                  *args,
                  workspace=workspace,
                  **kwargs)
def workspace():
    """
    Build a fresh workspace in WORKSPACE_DIR from METS_KANT and return it.

    WORKSPACE_DIR is recreated empty, the two input images are fetched from
    the OCR-D assets repository and thresholded in place, and all ``pc:Word``
    and ``pc:TextEquiv`` elements are stripped from the OCR-D-GT-SEG-LINE files.
    """
    if os.path.exists(WORKSPACE_DIR):
        shutil.rmtree(WORKSPACE_DIR)
    os.makedirs(WORKSPACE_DIR)

    ws = Resolver().workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)

    # XXX Work around data bug(?):
    #     PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
    image_names = ['INPUT_0017.tif', 'INPUT_0020.tif']
    image_dir = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG')
    os.makedirs(image_dir)
    for image_name in image_names:
        image_url = "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + image_name
        urllib.request.urlretrieve(image_url,
                                   os.path.join(image_dir, image_name))

    # The binarization options I have are:
    #
    # a. ocrd_kraken which tries to install clstm, whose installation is broken on my machine (protobuf)
    # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
    #    machine
    # c. just fumble with the original files
    #
    # So I'm going for option c.
    for image_name in image_names:
        image_path = os.path.join(image_dir, image_name)
        # thresholds the image in place (input and output path are the same)
        subprocess.call(['convert', image_path, '-threshold', '50%', image_path])

    # Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
    # XXX Review data again
    # XXX Make this more robust against namespace version changes
    for gt_file in ws.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
        ws.download_file(gt_file)
    gt_pattern = os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")
    for unwanted_xpath in ["//pc:Word", "//pc:TextEquiv"]:
        for gt_path in glob(gt_pattern):
            tree = etree.parse(gt_path)
            for element in tree.xpath(unwanted_xpath, namespaces=NSMAP_GT):
                element.getparent().remove(element)
            tree.write(gt_path, xml_declaration=True, encoding="utf-8")

    return ws
Example #26
0
def workspace():
    """
    Build a fresh workspace in WORKSPACE_DIR from a copy of the METS_KANT directory.

    The OCR-D-IMG images are thresholded in place, and all ``pc:Word`` and
    ``pc:TextEquiv`` elements are stripped from the OCR-D-GT-SEG-WORD-GLYPH files.
    """
    if os.path.exists(WORKSPACE_DIR):
        shutil.rmtree(WORKSPACE_DIR)
    os.makedirs(WORKSPACE_DIR)

    resolver = Resolver()
    # due to core#809 this does not always work:
    #workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
    # workaround: copy the whole source directory and resolve the local copy
    shutil.rmtree(WORKSPACE_DIR)
    source_dir = os.path.dirname(METS_KANT)
    shutil.copytree(source_dir, WORKSPACE_DIR)
    local_mets = os.path.join(WORKSPACE_DIR, 'mets.xml')
    ws = resolver.workspace_from_url(local_mets)

    # The binarization options I have are:
    #
    # a. ocrd_kraken which tries to install clstm, whose installation is broken on my machine (protobuf)
    # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
    #    machine
    # c. just fumble with the original files
    #
    # So I'm going for option c.
    for image_file in ws.mets.find_files(fileGrp="OCR-D-IMG"):
        image_file = ws.download_file(image_file)
        image_path = os.path.join(ws.directory, image_file.local_filename)
        subprocess.call(['mogrify', '-threshold', '50%', image_path])

    # Remove GT Words and TextEquivs, to not accidentally check GT text instead of the OCR text
    # XXX Review data again
    unwanted_xpaths = ["//pc:Word", "//pc:TextEquiv"]
    for gt_file in ws.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
        ws.download_file(gt_file)
        gt_path = os.path.join(ws.directory, gt_file.local_filename)
        tree = etree.parse(gt_path)
        # the PAGE namespace is taken from the document itself
        namespaces = {"pc": page_namespace(tree)}
        for unwanted_xpath in unwanted_xpaths:
            for element in tree.xpath(unwanted_xpath, namespaces=namespaces):
                element.getparent().remove(element)
        tree.write(gt_path, xml_declaration=True, encoding="utf-8")
        assertFileDoesNotContain(gt_path, "TextEquiv")

    return ws
Example #27
0
 def runTest(self):
     """Run region segmentation, line segmentation and word-level recognition, saving after each."""
     resolver = Resolver(cache_enabled=True)
     mets_url = assets.url_of(
         'kant_aufklaerung_1784-page-block-line-word/mets.xml')
     workspace = resolver.workspace_from_url(mets_url,
                                             directory=WORKSPACE_DIR)
     # (processor class, input fileGrp, output fileGrp, extra constructor kwargs)
     stages = (
         (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK", {}),
         (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE", {}),
         (TesserocrRecognize, "OCR-D-SEG-LINE", "OCR-D-OCR-TESS",
          {'parameter': {'textequiv_level': 'word'}}),
     )
     for processor_class, source_grp, target_grp, extra in stages:
         processor_class(workspace,
                         input_file_grp=source_grp,
                         output_file_grp=target_grp,
                         **extra).process()
         workspace.save_mets()
Example #28
0
 def test_task_run(self):
     """Run two chained ``dummy`` tasks and check that six files were added to the METS."""
     resolver = Resolver()
     source_dir = assets.path_to('kant_aufklaerung_1784/data')
     with copy_of_directory(source_dir) as wsdir, pushd_popd(wsdir):
         ws = resolver.workspace_from_url('mets.xml')
         ws.add_file(
             'GRP0',
             content='',
             local_filename='GRP0/foo',
             ID='file0',
             mimetype=MIMETYPE_PAGE,
             pageId=None,
         )
         ws.save_mets()
         files_before = len(ws.mets.find_files())
         task_lines = [
             "dummy -I OCR-D-IMG -O GRP1",
             "dummy -I GRP1 -O GRP2",
         ]
         run_tasks('mets.xml', 'DEBUG', None, task_lines)
         ws.reload_mets()
         # step 1: 2 images in OCR-D-IMG -> 2 images 2 PAGEXML in GRP1
         # step 2: 2 images and 2 PAGEXML in GRP1 -> process just the PAGEXML
         files_after = len(ws.mets.find_files())
         self.assertEqual(files_after, files_before + 6)
Example #29
0
    def runTest(self):
        """Run segmentation and recognition stages in order, saving the METS after each."""
        resolver = Resolver()
        workspace = resolver.workspace_from_url(METS_HEROLD_SMALL,
                                                dst_dir=WORKSPACE_DIR)
        # (processor class, input fileGrp, output fileGrp, extra constructor kwargs)
        # For the recognition stages: add dep tesseract-ocr-script-frak
        # to also pass 'model': 'Fraktur' in the parameters.
        stages = (
            (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK", {}),
            (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE", {}),
            (TesserocrRecognize, "OCR-D-SEG-LINE", "OCR-D-OCR-TESS",
             {'parameter': {'textequiv_level': 'line'}}),
            (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD", {}),
            (TesserocrRecognize, "OCR-D-SEG-WORD", "OCR-D-OCR-TESS-W2C",
             {'parameter': {'textequiv_level': 'glyph'}}),
        )
        for processor_class, source_grp, target_grp, extra in stages:
            processor_class(workspace,
                            input_file_grp=source_grp,
                            output_file_grp=target_grp,
                            **extra).process()
            workspace.save_mets()
def prepare_workspace(task: dict, resolver: Resolver,
                      dst_dir: str) -> Workspace:
    """
    Resolve a workspace for ``task`` in ``dst_dir``, fetch its files and save the METS.

    The workspace is resolved from ``task["src"]`` as ``mets.xml`` with
    ``clobber_mets=True``. If ``task["default_file_grp"]`` is ``"MAX"`` and the
    METS has no such fileGrp, each file of fileGrp ``DEFAULT`` is passed
    through ``add_max_file_to_workspace`` and the result is downloaded.
    Otherwise the files of the requested fileGrp that have no local filename
    yet are downloaded.
    """
    workspace = resolver.workspace_from_url(
        task["src"],
        dst_dir=dst_dir,
        mets_basename="mets.xml",
        clobber_mets=True,
    )

    max_grp_missing = (task["default_file_grp"] == "MAX"
                       and "MAX" not in workspace.mets.file_groups)
    if max_grp_missing:
        for ocrd_file in workspace.mets.find_files(fileGrp="DEFAULT"):
            max_file = add_max_file_to_workspace(workspace, ocrd_file)
            workspace.download_file(max_file)
    else:
        requested_grp = task["default_file_grp"]
        for ocrd_file in workspace.mets.find_files(fileGrp=requested_grp):
            # skip files that are already available locally
            if ocrd_file.local_filename:
                continue
            workspace.download_file(ocrd_file)

    workspace.save_mets()

    return workspace