def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry.

    ``mock_get`` is set up to answer with status code 200, the body
    ``b'foo bar'`` and a ``text/plain`` content type.  The test attaches a
    stream handler to the ``ocrd_models.utils.handle_oai_response`` logger
    and checks that a ``WARNING`` line from that logger was written while
    ``Resolver.download_to_directory`` ran.
    """

    # arrange
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    headers = {'Content-Type': 'text/plain'}
    mock_get.return_value.headers = headers
    resolver = Resolver()
    initLogging()

    # capture log
    log = getLogger('ocrd_models.utils.handle_oai_response')
    capt = FIFOIO(256)
    sh = StreamHandler(capt)
    sh.setFormatter(Formatter(LOG_FORMAT))
    log.addHandler(sh)

    try:
        # act
        resolver.download_to_directory(response_dir, url)

        # assert
        mock_get.assert_called_once_with(url)
        log_output = capt.getvalue()
        assert 'WARNING ocrd_models.utils.handle_oai_response' in log_output
    finally:
        # Loggers are process-wide: detach the capture handler so it does
        # not stay attached for whatever test runs next.
        log.removeHandler(sh)
Esempio n. 2
0
def ocrd_cli_wrap_processor(processorClass,
                            ocrd_tool=None,
                            mets=None,
                            working_dir=None,
                            dump_json=False,
                            version=False,
                            **kwargs):
    """
    Dispatch one command-line invocation of a processor.

    The options are checked in this order:

    * ``dump_json``: instantiate ``processorClass`` with ``workspace=None``
      and ``dump_json=True``, then return.
    * ``version``: instantiate ``processorClass`` with ``workspace=None``
      and print its ``version`` together with ``OCRD_VERSION``.
    * otherwise ``mets`` is mandatory.  A value without ``://`` is turned
      into a ``file://`` URL of its absolute path; a ``file://`` URL must
      point to an existing path.  A workspace is then resolved from the
      URL (with ``working_dir``) and handed to ``run_processor`` along
      with any remaining keyword arguments.

    Raises:
        Exception: if ``mets`` is missing, or is a ``file://`` URL whose
            path does not exist.
    """
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return

    if version:
        instance = processorClass(workspace=None)
        print("Version %s, ocrd/core %s" % (instance.version, OCRD_VERSION))
        return

    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')

    file_scheme = 'file://'
    if '://' not in mets:
        # bare path: make it an absolute file URL
        mets = file_scheme + os.path.abspath(mets)
    if mets.startswith(file_scheme):
        local_path = mets[len(file_scheme):]
        if not os.path.exists(local_path):
            raise Exception("File does not exist: %s" % mets)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    run_processor(processorClass,
                  ocrd_tool,
                  mets,
                  workspace=workspace,
                  **kwargs)
Esempio n. 3
0
class TestResolver(TestCase):
    """Tests that build workspaces through a ``Resolver``."""

    def setUp(self):
        """Create a resolver and copy ``FOLDER_KANT`` below ``TMP_FOLDER``."""
        self.resolver = Resolver()
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        tmp_already_there = os.path.exists(TMP_FOLDER)
        if tmp_already_there:
            # replace a leftover TMP_FOLDER with an empty one
            rmtree(TMP_FOLDER)
            os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url(self):
        """Download the first ``OCR-D-IMG`` file of ``METS_HEROLD`` and check its ID."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        first_image = workspace.mets.find_files(fileGrp='OCR-D-IMG')[0]
        downloaded = workspace.download_file(first_image)
        self.assertEqual(downloaded.ID, 'FILE_0001_IMAGE')

    def test_unpack_workspace(self):
        """Unpack ``TEST_ZIP``, expect two TIFF files and download both."""
        workspace = self.resolver.unpack_workspace_from_filename(TEST_ZIP)
        tiffs = workspace.mets.find_files(mimetype='image/tiff')
        self.assertEqual(len(tiffs), 2, '2 TIF')
        for tiff in tiffs:
            workspace.download_file(tiff)
        exif_as_xml = [
            OcrdExif.from_filename(tiff.local_filename).to_xml()
            for tiff in tiffs
        ]
        print(exif_as_xml)
 def test_validate_ocrd_file(self):
     """Validate the ``FAULTY_GLYPHS_FILE`` page and expect 17 ``ConsistencyError`` entries."""
     loader = Resolver()
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = loader.workspace_from_url(mets_url)
     # validation runs inside the pushd_popd context for the workspace
     # directory (presumably so relative file paths resolve -- confirm)
     with pushd_popd(workspace.directory):
         faulty_page = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_page)
         consistency_errors = [
             err for err in report.errors if isinstance(err, ConsistencyError)
         ]
         self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
Esempio n. 5
0
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Build a workspace from the ``mets`` and ``working_dir`` options and a
    ``Processor`` from ``page_id``, ``input_file_grp`` and
    ``output_file_grp``.  For every input file of that processor, print
    one line holding its `url`, `ID`, `mimetype` and `pageId`, followed
    by its `outputFileId` as computed by ``make_file_id``.

    (Each line is formatted as a bash associative array initializer.)

    Raises:
        Exception: if ``mets`` is a local filename that is not a file.
    """
    initLogging()
    mets_url = kwargs.pop('mets')
    workspace_dir = kwargs.pop('working_dir')
    if is_local_filename(mets_url):
        if not isfile(get_local_filename(mets_url)):
            raise Exception("File does not exist: %s" % mets_url)
    workspace = Resolver().workspace_from_url(mets_url, workspace_dir)
    processor = Processor(workspace,
                          ocrd_tool=None,
                          page_id=kwargs['page_id'],
                          input_file_grp=kwargs['input_file_grp'],
                          output_file_grp=kwargs['output_file_grp'])
    shown_fields = ('url', 'ID', 'mimetype', 'pageId')
    for current in processor.input_files:
        # "[key]='value'" pairs, space separated, all on one line
        for name in shown_fields:
            print("[%s]='%s'" % (name, getattr(current, name)), end=' ')
        # the last pair ends the line
        output_id = make_file_id(current, kwargs['output_file_grp'])
        print("[outputFileId]='%s'" % output_id)
Esempio n. 6
0
 def test_overwrite(self):
     """Task validation rejects an existing output group unless ``overwrite=True``."""
     resolver = Resolver()
     # the same four-step sequence is validated twice below
     task_specs = [
         "sample-processor -I OCR-D-IMG       -O OCR-D-SEG-BLOCK",
         "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE",
         "sample-processor -I OCR-D-SEG-LINE  -O OCR-D-SEG-WORD",
         "sample-processor -I OCR-D-SEG-WORD  -O OCR-D-OCR-TESS",
     ]
     expected_error = r"Invalid task sequence input/output file groups: \[\"Output fileGrp\[@USE='OCR-D-SEG-WORD'\] already in METS!\"\]"
     with TemporaryDirectory() as tempdir:
         mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
         workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
         # put a file into OCR-D-SEG-WORD, the output group of the third step
         workspace.mets.add_file('OCR-D-SEG-WORD',
                                 url='foo/bar',
                                 ID='foo',
                                 pageId='page1',
                                 mimetype='image/tif')
         # without overwrite the clash must be reported
         with self.assertRaisesRegex(Exception, expected_error):
             validate_tasks(
                 [ProcessorTask.parse(spec) for spec in task_specs],
                 workspace)
         # with overwrite the same sequence must validate
         validate_tasks(
             [ProcessorTask.parse(spec) for spec in task_specs],
             workspace,
             overwrite=True)
Esempio n. 7
0
def _fixture_plain_workspace(tmp_path):
    """Yield a workspace created from nothing in ``tmp_path``.

    The current directory is switched to ``tmp_path`` before yielding and
    switched back to its previous value once the generator is resumed.
    """
    workspace = Resolver().workspace_from_nothing(directory=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to the directory we started in
    chdir(original_cwd)
Esempio n. 8
0
    def test_validate_sequence(self):
        """Task validation reports input file groups that nothing provides."""
        resolver = Resolver()
        with TemporaryDirectory() as tempdir:
            mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
            workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
            params_path = Path(tempdir, 'params.json')
            params_path.write_text('{"param1": true}')

            # the second step reads FOO, which no step produces
            unknown_second_input = [
                '%s -I OCR-D-IMG -O OUT1 -p %s' %
                (SAMPLE_NAME_REQUIRED_PARAM, params_path),
                '%s -I FOO -O OUT2 -p %s' %
                (SAMPLE_NAME_REQUIRED_PARAM, params_path),
            ]
            with self.assertRaisesRegex(
                    Exception,
                    "Input file group not contained in METS or produced by previous steps: FOO'"
            ):
                validate_tasks(
                    [ProcessorTask.parse(cmd) for cmd in unknown_second_input],
                    workspace)

            # the only step reads IN
            unknown_first_input = [
                '%s -I IN -O OUT1 -p %s' %
                (SAMPLE_NAME_REQUIRED_PARAM, params_path),
            ]
            with self.assertRaisesRegex(
                    Exception, "Input fileGrp.@USE='IN'. not in METS!"):
                validate_tasks(
                    [ProcessorTask.parse(cmd) for cmd in unknown_first_input],
                    workspace)
Esempio n. 9
0
 def setUp(self):
     """Create a resolver with ``cache_enabled=True`` and copy ``FOLDER_KANT`` below ``TMP_FOLDER``."""
     self.resolver = Resolver(cache_enabled=True)
     self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
     leftover_tmp = os.path.exists(TMP_FOLDER)
     if leftover_tmp:
         # drop whatever an earlier run left behind, then recreate it empty
         rmtree(TMP_FOLDER)
         os.makedirs(TMP_FOLDER)
     copytree(FOLDER_KANT, self.folder)
Esempio n. 10
0
 def setUp(self):
     """Create a resolver and copy ``FOLDER_KANT`` below ``TMP_FOLDER``."""
     self.resolver = Resolver()
     self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
     leftover_tmp = exists(TMP_FOLDER)
     if leftover_tmp:
         # drop whatever an earlier run left behind, then recreate it empty
         rmtree(TMP_FOLDER)
         os.makedirs(TMP_FOLDER)
     copytree(FOLDER_KANT, self.folder)
Esempio n. 11
0
 def test_validate_ocrd_file(self):
     """Validate ``FAULTY_GLYPHS_FILE`` (downloading it if needed) and expect 17 errors."""
     loader = Resolver()
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = loader.workspace_from_url(mets_url)
     faulty_page = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
     has_local_copy = bool(faulty_page.local_filename)
     if not has_local_copy:
         workspace.download_file(faulty_page)
     report = PageValidator.validate(ocrd_file=faulty_page)
     error_count = len(report.errors)
     self.assertEqual(error_count, 17, 'errors')
Esempio n. 12
0
def _fixture_workspace_sample_features(tmp_path):
    """Yield a workspace for a copy of ``tests/data/sample-features``.

    The data is copied into ``tmp_path``; the current directory is
    ``tmp_path`` while the workspace is in use and is reset afterwards.
    """
    target = str(tmp_path)
    copytree('tests/data/sample-features', target)
    workspace = Resolver().workspace_from_url(join(target, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to the directory we started in
    chdir(original_cwd)
Esempio n. 13
0
 def test_validate_ocrd_file(self):
     """Validate ``FAULTY_GLYPHS_FILE`` from within the workspace directory and expect 17 errors."""
     loader = Resolver()
     mets_url = assets.url_of('glyph-consistency/data/mets.xml')
     workspace = loader.workspace_from_url(mets_url)
     with pushd_popd(workspace.directory):
         faulty_page = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
         report = PageValidator.validate(ocrd_file=faulty_page)
         error_count = len(report.errors)
         self.assertEqual(error_count, 17, 'errors')
Esempio n. 14
0
def _fixture_workspace_gutachten_data(tmp_path):
    """Yield a workspace for a copy of the ``gutachten/data`` assets.

    The data is copied into ``tmp_path``; the current directory is
    ``tmp_path`` while the workspace is in use and is reset afterwards.
    """
    target = str(tmp_path)
    copytree(assets.path_to('gutachten/data'), target)
    workspace = Resolver().workspace_from_url(join(target, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to the directory we started in
    chdir(original_cwd)
Esempio n. 15
0
 def runTest(self):
     """Run region segmentation from ``OCR-D-IMG`` into ``OCR-D-SEG-BLOCK`` and save the METS."""
     workspace = Resolver().workspace_from_url(METS_HEROLD_SMALL,
                                               dst_dir=WORKSPACE_DIR)
     segmenter = TesserocrSegmentRegion(workspace,
                                        input_file_grp="OCR-D-IMG",
                                        output_file_grp="OCR-D-SEG-BLOCK")
     segmenter.process()
     workspace.save_mets()
Esempio n. 16
0
def test_resolve_image0():
    """Resolve the first ``OCR-D-IMG`` file as an image, with and without coordinates."""
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_file = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]

    # no coordinates given
    whole_image = workspace._resolve_image_as_pil(first_file.url)
    assert whole_image.size == (2875, 3749)

    # with the coordinates [[0, 0], [1, 1]] a 1x1 image is expected
    unit_box = [[0, 0], [1, 1]]
    boxed_image = workspace._resolve_image_as_pil(first_file.url, unit_box)
    assert boxed_image.size == (1, 1)
Esempio n. 17
0
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """Yield a workspace for a copy of the ``kant_aufklaerung_1784/data/`` assets.

    The data is copied into ``tmp_path`` and the workspace is resolved
    with ``tmp_path`` as ``src_baseurl``.  The current directory is
    ``tmp_path`` while the workspace is in use and is reset afterwards.
    """
    copytree(assets.path_to('kant_aufklaerung_1784/data/'), str(tmp_path))
    loader = Resolver()
    mets_file = join(tmp_path, 'mets.xml')
    workspace = loader.workspace_from_url(mets_file, src_baseurl=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to the directory we started in
    chdir(original_cwd)
Esempio n. 18
0
def test_resolve_image_as_pil_deprecated():
    """Calling ``resolve_image_as_pil`` emits exactly one ``DeprecationWarning``."""
    # arrange
    asset_base = assets.url_of('kant_aufklaerung_1784-binarized')
    mets_url = os.path.join(asset_base, 'data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)

    # act
    with pytest.warns(DeprecationWarning) as caught:
        workspace.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # assert
    assert len(caught) == 1
    warning_text = str(caught[0].message)
    assert 'Call to deprecated method resolve_image_as_pil.' in warning_text
Esempio n. 19
0
def test_workspace_from_url0():
    """Download the first ``OCR-D-IMG`` file of ``METS_HEROLD`` and check ID and local path."""

    # act
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    downloaded = workspace.download_file(first_image)

    # assert
    tif_name = '%s.tif' % downloaded.ID
    assert tif_name == 'FILE_0001_IMAGE.tif'
    assert downloaded.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
Esempio n. 20
0
 def test_run1(self):
     """Run ``KrakenSegment`` at line level on ``OCR-D-IMG-BIN`` and save the METS."""
     loader = Resolver()
     mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
     workspace = loader.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     settings = {'level-of-operation': 'line'}
     segmenter = KrakenSegment(workspace,
                               input_file_grp="OCR-D-IMG-BIN",
                               output_file_grp="OCR-D-SEG-LINE-KRAKEN",
                               parameter=settings)
     segmenter.process()
     workspace.save_mets()
Esempio n. 21
0
 def test_param_json(self):
     """Run ``KrakenOcr`` through ``run_processor`` on the ``INPUT`` group and save the METS."""
     resolver = Resolver()
     mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
     workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
     # the same resolver that built the workspace is passed on
     run_processor(KrakenOcr,
                   resolver=resolver,
                   workspace=workspace,
                   input_file_grp="INPUT",
                   output_file_grp="OCR-D-OCR-KRAKEN")
     workspace.save_mets()
 def runTest(self):
     """Run region, then line segmentation, and save the METS once at the end."""
     loader = Resolver(cache_enabled=True)
     workspace = loader.workspace_from_url(METS_HEROLD_SMALL,
                                           directory=WORKSPACE_DIR)
     region_step = TesserocrSegmentRegion(workspace,
                                          input_file_grp="INPUT",
                                          output_file_grp="OCR-D-SEG-BLOCK")
     region_step.process()
     # no save between the steps: the line step reads OCR-D-SEG-BLOCK
     # from the same workspace object
     line_step = TesserocrSegmentLine(workspace,
                                      input_file_grp="OCR-D-SEG-BLOCK",
                                      output_file_grp="OCR-D-SEG-LINE")
     line_step.process()
     workspace.save_mets()
Esempio n. 23
0
def run_tasks(mets, log_level, page_id, task_strs):
    """
    Run a sequence of processor tasks, one after the other.

    A workspace is resolved from ``mets`` and every entry of
    ``task_strs`` is parsed into a ``ProcessorTask``.  For each task, in
    order: validate it, require all of its input file groups and none of
    its output file groups to be in the workspace METS, run its
    executable via ``run_cli``, reload the METS and require all of its
    output file groups to be present.

    Raises:
        Exception: if an input file group is missing, an output file
            group already exists, the executable returns a non-zero
            value, or an output file group is missing afterwards.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    log = getLogger('ocrd.task_sequence')
    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]

    for task in tasks:

        task.validate()

        # every input group has to be there already
        for wanted_grp in task.input_file_grps:
            if wanted_grp in workspace.mets.file_groups:
                continue
            raise Exception(
                "Unmet requirement: expected input file group not contained in mets: %s"
                % wanted_grp)

        # no output group may be there yet
        for planned_grp in task.output_file_grps:
            if planned_grp not in workspace.mets.file_groups:
                continue
            raise Exception(
                "Conflict: output file group already contained in mets: %s"
                % planned_grp)

        log.info("Start processing task '%s'", task)

        # execute cli; file groups are handed over comma-separated
        joined_inputs = ','.join(task.input_file_grps)
        joined_outputs = ','.join(task.output_file_grps)
        exit_status = run_cli(task.executable,
                              mets,
                              resolver,
                              workspace,
                              log_level=log_level,
                              page_id=page_id,
                              input_file_grp=joined_inputs,
                              output_file_grp=joined_outputs,
                              parameter=task.parameter_path)

        # check return code
        if exit_status != 0:
            raise Exception("%s exited with non-zero return value %s" %
                            (task.executable, exit_status))

        log.info("Finished processing task '%s'", task)

        # reload mets before looking for the results
        workspace.reload_mets()

        # every output group has to be there now
        for produced_grp in task.output_file_grps:
            if produced_grp in workspace.mets.file_groups:
                continue
            raise Exception(
                "Invalid state: expected output file group not in mets: %s"
                % produced_grp)
Esempio n. 24
0
def test_workspace_from_nothing_noclobber(tmp_path):
    """Attempt to re-create workspace shall fail because already created

    The first ``workspace_from_nothing`` call on ``tmp_path`` must succeed;
    a second call on the same directory must raise, and the message of
    the raised exception must name the existing ``mets.xml``.
    """

    ws2 = Resolver().workspace_from_nothing(tmp_path)
    assert ws2.directory == tmp_path

    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_nothing(tmp_path)

    # assert
    the_msg = "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tmp_path
    # Compare against the raised exception itself (exc.value).  str(exc)
    # is the string form of pytest's ExceptionInfo wrapper, not the
    # exception message.
    assert the_msg in str(exc.value)
Esempio n. 25
0
def test_download_url_without_baseurl_raises_exception(tmp_path):
    """Downloading ``SAMPLE_FILE_URL`` from a workspace in ``tmp_path`` raises, naming the tried baseurl."""
    # arrange
    mets_copy = join(tmp_path, 'mets.xml')
    copyfile(SRC_METS, mets_copy)
    workspace = Resolver().workspace_from_url(mets_copy)
    sample_file = _url_to_file(SAMPLE_FILE_URL)

    # act
    with pytest.raises(Exception) as raised:
        workspace.download_file(sample_file)

    # assert exception message contents
    expected_fragment = "Already tried prepending baseurl '%s'" % str(tmp_path)
    assert expected_fragment in str(raised.value)
Esempio n. 26
0
 def test_run1(self):
     """Run ``OcropySegment`` on page ``P_0017`` of ``OCR-D-IMG-BIN`` in a temporary workspace."""
     loader = Resolver()
     with TemporaryDirectory() as scratch_dir:
         mets_path = assets.path_to(
             'kant_aufklaerung_1784-binarized/data/mets.xml')
         workspace = loader.workspace_from_url(mets_path, dst_dir=scratch_dir)
         segmenter = OcropySegment(
             workspace,
             input_file_grp="OCR-D-IMG-BIN",
             output_file_grp="OCR-D-SEG-OCROPY-TEST",
             page_id='P_0017',
         )
         segmenter.process()
         workspace.save_mets()
def test_workspace_remove_groups_unforce(workspace_directory):
    """Remove groups by pattern recursive

    Before the call the METS must hold exactly one ``OCR-D-GT-ALTO``
    file group with two files; after
    ``remove_file_group('//OCR-D-GT.*', recursive=True)`` and saving,
    the written METS must hold no such group.
    """
    mets_file = os.path.join(workspace_directory, 'mets.xml')
    alto_group_query = './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]'

    # arrange
    root_before = ET.parse(mets_file).getroot()
    groups_before = root_before.findall(alto_group_query)
    assert len(groups_before) == 1
    files_before = groups_before[0].findall('.//{http://www.loc.gov/METS/}file')
    assert len(files_before) == 2

    # act
    workspace = Workspace(Resolver(), workspace_directory)
    workspace.remove_file_group('//OCR-D-GT.*', recursive=True)
    workspace.save_mets()

    # assert
    root_after = ET.parse(mets_file).getroot()
    assert root_after is not None
    groups_after = root_after.findall(alto_group_query)
    assert not groups_after
Esempio n. 28
0
 def runTest(self):
     """Run region, line and word segmentation in sequence, then save the METS."""
     loader = Resolver()
     workspace = loader.workspace_from_url(
         assets.url_of('kant_aufklaerung_1784-binarized/mets.xml'),
         directory=WORKSPACE_DIR)
     # each step reads the file group written by the step before it
     pipeline = (
         (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
         (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
         (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
     )
     for step_class, source_grp, target_grp in pipeline:
         step_class(workspace,
                    input_file_grp=source_grp,
                    output_file_grp=target_grp).process()
     workspace.save_mets()
Esempio n. 29
0
def test_download_to_directory_with_badargs(url, basename, exc_msg):
    """``download_to_directory`` raises with ``exc_msg`` for the given bad arguments.

    ``url`` and ``basename`` are passed positionally to
    ``Resolver.download_to_directory``; ``exc_msg`` is the text expected
    inside the message of the raised exception.
    """

    with pytest.raises(Exception) as exc:
        Resolver().download_to_directory(url, basename)

    # assert exception message contained
    # Compare against the raised exception itself (exc.value).  str(exc)
    # is the string form of pytest's ExceptionInfo wrapper, not the
    # exception message.
    assert exc_msg in str(exc.value)
Esempio n. 30
0
def test_workspace_init_missing_mets():
    """Raise Exception when missing mets-file in workspace"""
    directory_without_mets = "foo/bar"

    with pytest.raises(Exception) as raised:
        Workspace(Resolver(), directory_without_mets)

    message = str(raised.value)
    assert "File does not exist" in message