def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry.

    ``mock_get`` is set up to answer with status code 200, a ``text/plain``
    content type and the body ``b'foo bar'``. The test attaches a capturing
    handler to the ``ocrd_models.utils.handle_oai_response`` logger and
    expects a WARNING record from that logger after the download call.
    """
    # arrange
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    headers = {'Content-Type': 'text/plain'}
    mock_get.return_value.headers = headers
    resolver = Resolver()
    initLogging()

    # capture log
    log = getLogger('ocrd_models.utils.handle_oai_response')
    capt = FIFOIO(256)
    sh = StreamHandler(capt)
    sh.setFormatter(Formatter(LOG_FORMAT))
    log.addHandler(sh)

    try:
        # act
        resolver.download_to_directory(response_dir, url)

        # assert
        mock_get.assert_called_once_with(url)
        log_output = capt.getvalue()
        assert 'WARNING ocrd_models.utils.handle_oai_response' in log_output
    finally:
        # Detach the capture handler again; otherwise it stays registered on
        # the logger and keeps receiving records from tests that run later.
        log.removeHandler(sh)
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, version=False, **kwargs):
    """
    Dispatch a processor CLI invocation.

    The options are checked in this order:

    * ``dump_json``: instantiate ``processorClass`` with ``workspace=None``
      and ``dump_json=True``, then return.
    * ``version``: instantiate ``processorClass`` with ``workspace=None`` and
      print its ``version`` together with ``OCRD_VERSION``, then return.
    * otherwise ``mets`` is required. A value without ``://`` is turned into
      an absolute ``file://`` URL, a ``file://`` URL must point to an
      existing path, and the processor is run on the workspace the resolver
      creates from that URL.

    Raises:
        Exception: if ``mets`` is missing, or if it is a ``file://`` URL
            whose path does not exist.
    """
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return

    if version:
        p = processorClass(workspace=None)
        print("Version %s, ocrd/core %s" % (p.version, OCRD_VERSION))
        return

    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')

    # A bare path (no URL scheme) is interpreted as a local file
    if '://' not in mets:
        mets = 'file://' + os.path.abspath(mets)

    file_scheme = 'file://'
    if mets.startswith(file_scheme):
        local_path = mets[len(file_scheme):]
        if not os.path.exists(local_path):
            raise Exception("File does not exist: %s" % mets)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
class TestResolver(TestCase):
    """Resolver tests that run against a fresh copy of ``FOLDER_KANT``."""

    def setUp(self):
        """Create the resolver and repopulate ``TMP_FOLDER`` from scratch."""
        self.resolver = Resolver()
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        # Drop leftovers of a previous run, then always (re)create the
        # temp folder so the copy below never depends on prior state.
        if os.path.exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url(self):
        """Download the first ``OCR-D-IMG`` file of ``METS_HEROLD`` and check its ID."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        image_file = input_files[0]
        f = workspace.download_file(image_file)
        self.assertEqual(f.ID, 'FILE_0001_IMAGE')

    def test_unpack_workspace(self):
        """Unpack ``TEST_ZIP``, expect two TIFF files and read EXIF data of each."""
        workspace = self.resolver.unpack_workspace_from_filename(TEST_ZIP)
        files = workspace.mets.find_files(mimetype='image/tiff')
        self.assertEqual(len(files), 2, '2 TIF')
        for f in files:
            workspace.download_file(f)
        print([OcrdExif.from_filename(f.local_filename).to_xml() for f in files])
def test_validate_ocrd_file(self):
    """Validating ``FAULTY_GLYPHS_FILE`` must report 17 ``ConsistencyError`` entries."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    # Validation runs with the workspace directory as working directory
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        consistency_errors = [err for err in report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Build a workspace and a processor from the given processing options,
    then iterate over the processor's input files. For every input file,
    print its `url`, `ID`, `mimetype` and `pageId` followed by the
    recommended `outputFileId` (as returned by ``make_file_id``).

    (Each input file yields one line, formatted as a bash associative
    array initializer.)

    Raises:
        Exception: if ``mets`` is a local filename that is not a file.
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')

    mets_missing = is_local_filename(mets) and not isfile(get_local_filename(mets))
    if mets_missing:
        raise Exception("File does not exist: %s" % mets)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    processor = Processor(
        workspace,
        ocrd_tool=None,
        page_id=kwargs['page_id'],
        input_file_grp=kwargs['input_file_grp'],
        output_file_grp=kwargs['output_file_grp'],
    )

    printed_fields = ('url', 'ID', 'mimetype', 'pageId')
    for input_file in processor.input_files:
        for name in printed_fields:
            # make this bash-friendly (show initialization for associative array)
            value = getattr(input_file, name)
            print("[%s]='%s'" % (name, value), end=' ')
        output_file_id = make_file_id(input_file, kwargs['output_file_grp'])
        print("[outputFileId]='%s'" % output_file_id)
def test_overwrite(self):
    """An already existing output fileGrp fails validation unless ``overwrite`` is set."""
    task_strs = [
        "sample-processor -I OCR-D-IMG -O OCR-D-SEG-BLOCK",
        "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE",
        "sample-processor -I OCR-D-SEG-LINE -O OCR-D-SEG-WORD",
        "sample-processor -I OCR-D-SEG-WORD -O OCR-D-OCR-TESS",
    ]
    expected_error = r"Invalid task sequence input/output file groups: \[\"Output fileGrp\[@USE='OCR-D-SEG-WORD'\] already in METS!\"\]"
    resolver = Resolver()
    with TemporaryDirectory() as tempdir:
        mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
        workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)

        # should fail at step 3
        workspace.mets.add_file('OCR-D-SEG-WORD', url='foo/bar', ID='foo', pageId='page1', mimetype='image/tif')
        with self.assertRaisesRegex(Exception, expected_error):
            validate_tasks([ProcessorTask.parse(task_str) for task_str in task_strs], workspace)

        # should succeed b/c overwrite
        validate_tasks([ProcessorTask.parse(task_str) for task_str in task_strs], workspace, overwrite=True)
def _fixture_plain_workspace(tmp_path):
    """Yield a workspace created from nothing in ``tmp_path``.

    The working directory is ``tmp_path`` while the workspace is in use and
    is switched back to the previous directory afterwards.
    """
    workspace = Resolver().workspace_from_nothing(directory=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to where we started
    chdir(original_cwd)
def test_validate_sequence(self):
    """Task sequences whose input fileGrp is unavailable must be rejected."""
    resolver = Resolver()
    with TemporaryDirectory() as tempdir:
        mets_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
        workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
        params_path = Path(tempdir, 'params.json')
        params_path.write_text('{"param1": true}')

        # second step reads FOO, which no earlier step produces
        unknown_input_tasks = [
            '%s -I OCR-D-IMG -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path),
            '%s -I FOO -O OUT2 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path),
        ]
        with self.assertRaisesRegex(Exception, "Input file group not contained in METS or produced by previous steps: FOO'"):
            validate_tasks([ProcessorTask.parse(task_str) for task_str in unknown_input_tasks], workspace)

        # first step already reads a fileGrp that is expected to be missing
        missing_input_tasks = [
            '%s -I IN -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path),
        ]
        with self.assertRaisesRegex(Exception, "Input fileGrp.@USE='IN'. not in METS!"):
            validate_tasks([ProcessorTask.parse(task_str) for task_str in missing_input_tasks], workspace)
def setUp(self):
    """Create a caching resolver and repopulate ``TMP_FOLDER`` from ``FOLDER_KANT``."""
    self.resolver = Resolver(cache_enabled=True)
    self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
    # Drop leftovers of a previous run, then always (re)create the temp
    # folder so the copy below never depends on prior state.
    if os.path.exists(TMP_FOLDER):
        rmtree(TMP_FOLDER)
    os.makedirs(TMP_FOLDER)
    copytree(FOLDER_KANT, self.folder)
def setUp(self):
    """Create the resolver and repopulate ``TMP_FOLDER`` from ``FOLDER_KANT``."""
    self.resolver = Resolver()
    self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
    # Drop leftovers of a previous run, then always (re)create the temp
    # folder so the copy below never depends on prior state.
    if exists(TMP_FOLDER):
        rmtree(TMP_FOLDER)
    os.makedirs(TMP_FOLDER)
    copytree(FOLDER_KANT, self.folder)
def test_validate_ocrd_file(self):
    """Validating ``FAULTY_GLYPHS_FILE`` must report 17 errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    matches = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")
    ocrd_file = matches[0]
    # fetch the file first when it has no local copy yet
    if not ocrd_file.local_filename:
        workspace.download_file(ocrd_file)
    report = PageValidator.validate(ocrd_file=ocrd_file)
    error_count = len(report.errors)
    self.assertEqual(error_count, 17, 'errors')
def _fixture_workspace_sample_features(tmp_path):
    """Yield a workspace for a copy of ``tests/data/sample-features`` in ``tmp_path``.

    The working directory is ``tmp_path`` while the workspace is in use and
    is switched back to the previous directory afterwards.
    """
    target_dir = str(tmp_path)
    copytree('tests/data/sample-features', target_dir)
    workspace = Resolver().workspace_from_url(join(target_dir, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to where we started
    chdir(original_cwd)
def test_validate_ocrd_file(self):
    """Validating ``FAULTY_GLYPHS_FILE`` must report 17 errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    # Validation runs with the workspace directory as working directory
    with pushd_popd(workspace.directory):
        faulty_file = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty_file)
        error_count = len(report.errors)
        self.assertEqual(error_count, 17, 'errors')
def _fixture_workspace_gutachten_data(tmp_path):
    """Yield a workspace for a copy of the ``gutachten/data`` asset in ``tmp_path``.

    The working directory is ``tmp_path`` while the workspace is in use and
    is switched back to the previous directory afterwards.
    """
    target_dir = str(tmp_path)
    copytree(assets.path_to('gutachten/data'), target_dir)
    workspace = Resolver().workspace_from_url(join(target_dir, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to where we started
    chdir(original_cwd)
def runTest(self):
    """Run region segmentation from ``OCR-D-IMG`` to ``OCR-D-SEG-BLOCK`` and save the METS."""
    workspace = Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
    region_segmenter = TesserocrSegmentRegion(
        workspace,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
    )
    region_segmenter.process()
    workspace.save_mets()
def test_resolve_image0():
    """Resolve the first ``OCR-D-IMG`` file as PIL image, in full and with coordinates."""
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]

    # without coordinates: expected size of the whole image
    whole = workspace._resolve_image_as_pil(first_image.url)
    assert whole.size == (2875, 3749)

    # with coordinates [[0, 0], [1, 1]]: expected size is one pixel
    clipped = workspace._resolve_image_as_pil(first_image.url, [[0, 0], [1, 1]])
    assert clipped.size == (1, 1)
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """Yield a workspace for a copy of the ``kant_aufklaerung_1784/data/`` asset.

    The asset is copied into ``tmp_path``, which is also passed as
    ``src_baseurl``. The working directory is ``tmp_path`` while the
    workspace is in use and is switched back afterwards.
    """
    copytree(assets.path_to('kant_aufklaerung_1784/data/'), str(tmp_path))
    mets_path = join(tmp_path, 'mets.xml')
    workspace = Resolver().workspace_from_url(mets_path, src_baseurl=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # teardown: return to where we started
    chdir(original_cwd)
def test_resolve_image_as_pil_deprecated():
    """Calling ``resolve_image_as_pil`` must emit exactly one ``DeprecationWarning``."""
    base_url = assets.url_of('kant_aufklaerung_1784-binarized')
    workspace = Resolver().workspace_from_url(os.path.join(base_url, 'data/mets.xml'))

    with pytest.warns(DeprecationWarning) as record:
        workspace.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # assert
    assert len(record) == 1
    warning_text = str(record[0].message)
    assert 'Call to deprecated method resolve_image_as_pil.' in warning_text
def test_workspace_from_url0():
    """Download the first ``OCR-D-IMG`` file of ``METS_HEROLD`` and check ID and local path."""
    # act
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    downloaded = workspace.download_file(first_image)

    # assert
    tif_name = '%s.tif' % downloaded.ID
    assert tif_name == 'FILE_0001_IMAGE.tif'
    assert downloaded.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
def test_run1(self):
    """Run ``KrakenSegment`` at line level on ``OCR-D-IMG-BIN`` and save the METS."""
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
    segmenter_options = {
        'input_file_grp': "OCR-D-IMG-BIN",
        'output_file_grp': "OCR-D-SEG-LINE-KRAKEN",
        'parameter': {'level-of-operation': 'line'},
    }
    KrakenSegment(workspace, **segmenter_options).process()
    workspace.save_mets()
def test_param_json(self):
    """Run ``KrakenOcr`` through ``run_processor`` on the one-file METS and save it."""
    resolver = Resolver()
    mets_url = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
    workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
    run_processor(
        KrakenOcr,
        resolver=resolver,
        workspace=workspace,
        input_file_grp="INPUT",
        output_file_grp="OCR-D-OCR-KRAKEN",
    )
    workspace.save_mets()
def runTest(self):
    """Run region and then line segmentation, saving the METS once at the end."""
    resolver = Resolver(cache_enabled=True)
    workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, directory=WORKSPACE_DIR)
    # (processor class, input fileGrp, output fileGrp), executed in order;
    # each processor is only instantiated after the previous one has finished
    steps = (
        (TesserocrSegmentRegion, "INPUT", "OCR-D-SEG-BLOCK"),
        (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
    )
    for processor_class, in_grp, out_grp in steps:
        processor_class(workspace, input_file_grp=in_grp, output_file_grp=out_grp).process()
    workspace.save_mets()
def run_tasks(mets, log_level, page_id, task_strs):
    """
    Run a sequence of processor tasks against the workspace of ``mets``.

    Each entry of ``task_strs`` is parsed with ``ProcessorTask.parse``. For
    every task, in order: validate it, check its file groups against the
    METS, run its executable via ``run_cli``, reload the METS and check that
    the output file groups are now present.

    Args:
        mets: METS location, passed to the resolver and to ``run_cli``.
        log_level: forwarded to ``run_cli`` as ``log_level``.
        page_id: forwarded to ``run_cli`` as ``page_id``.
        task_strs: iterable of task descriptions to parse.

    Raises:
        Exception: if an input file group is missing from the METS, an
            output file group already exists, the executable returns a
            non-zero code, or an output file group is missing afterwards.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    log = getLogger('ocrd.task_sequence')
    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]

    for task in tasks:

        task.validate()

        # check input file groups are in mets
        for input_file_grp in task.input_file_grps:
            if input_file_grp not in workspace.mets.file_groups:
                raise Exception("Unmet requirement: expected input file group not contained in mets: %s" % input_file_grp)

        # check output file groups are not in mets yet
        for output_file_grp in task.output_file_grps:
            if output_file_grp in workspace.mets.file_groups:
                raise Exception("Conflict: output file group already contained in mets: %s" % output_file_grp)

        log.info("Start processing task '%s'", task)

        # execute cli
        returncode = run_cli(
            task.executable,
            mets,
            resolver,
            workspace,
            log_level=log_level,
            page_id=page_id,
            input_file_grp=','.join(task.input_file_grps),
            output_file_grp=','.join(task.output_file_grps),
            parameter=task.parameter_path
        )

        # check return code
        if returncode != 0:
            raise Exception("%s exited with non-zero return value %s" % (task.executable, returncode))

        log.info("Finished processing task '%s'", task)

        # reload mets
        workspace.reload_mets()

        # check output file groups are in mets
        for output_file_grp in task.output_file_grps:
            if output_file_grp not in workspace.mets.file_groups:
                raise Exception("Invalid state: expected output file group not in mets: %s" % output_file_grp)
def test_workspace_from_nothing_noclobber(tmp_path):
    """Attempt to re-create workspace shall fail because already created
    """
    ws2 = Resolver().workspace_from_nothing(tmp_path)
    assert ws2.directory == tmp_path

    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_nothing(tmp_path)

    # assert
    the_msg = "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tmp_path
    # Compare against the raised exception itself (``exc.value``): ``exc`` is
    # pytest's ExceptionInfo wrapper, whose string form is not the message.
    assert the_msg in str(exc.value)
def test_download_url_without_baseurl_raises_exception(tmp_path):
    """Downloading ``SAMPLE_FILE_URL`` must fail and mention the prepended baseurl."""
    # arrange
    mets_copy = join(tmp_path, 'mets.xml')
    copyfile(SRC_METS, mets_copy)
    workspace = Resolver().workspace_from_url(mets_copy)
    sample_file = _url_to_file(SAMPLE_FILE_URL)

    # act
    with pytest.raises(Exception) as exc:
        workspace.download_file(sample_file)

    # assert exception message contents
    expected_fragment = "Already tried prepending baseurl '%s'" % str(tmp_path)
    assert expected_fragment in str(exc.value)
def test_run1(self):
    """Run ``OcropySegment`` for page ``P_0017`` in a temporary workspace and save the METS."""
    resolver = Resolver()
    with TemporaryDirectory() as tempdir:
        mets_path = assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml')
        workspace = resolver.workspace_from_url(mets_path, dst_dir=tempdir)
        segmenter_options = {
            'input_file_grp': "OCR-D-IMG-BIN",
            'output_file_grp': "OCR-D-SEG-OCROPY-TEST",
            'page_id': 'P_0017',
        }
        OcropySegment(workspace, **segmenter_options).process()
        workspace.save_mets()
def test_workspace_remove_groups_unforce(workspace_directory):
    """Remove groups by pattern recursive"""
    mets_path = os.path.join(workspace_directory, 'mets.xml')
    alto_group_xpath = './/{http://www.loc.gov/METS/}fileGrp[@USE="OCR-D-GT-ALTO"]'

    # arrange: the ALTO group exists once and holds two files
    root_before = ET.parse(mets_path).getroot()
    groups_before = root_before.findall(alto_group_xpath)
    assert len(groups_before) == 1
    files_before = groups_before[0].findall('.//{http://www.loc.gov/METS/}file')
    assert len(files_before) == 2

    # act
    workspace = Workspace(Resolver(), workspace_directory)
    workspace.remove_file_group('//OCR-D-GT.*', recursive=True)
    workspace.save_mets()

    # assert: the saved METS no longer has the ALTO group
    root_after = ET.parse(mets_path).getroot()
    assert root_after is not None
    groups_after = root_after.findall(alto_group_xpath)
    assert not groups_after
def runTest(self):
    """Run region, line and word segmentation in sequence, then save the METS."""
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url, directory=WORKSPACE_DIR)
    # (processor class, input fileGrp, output fileGrp), executed in order;
    # each processor is only instantiated after the previous one has finished
    steps = (
        (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
        (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
        (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
    )
    for processor_class, in_grp, out_grp in steps:
        processor_class(workspace, input_file_grp=in_grp, output_file_grp=out_grp).process()
    workspace.save_mets()
def test_download_to_directory_with_badargs(url, basename, exc_msg):
    """``download_to_directory`` must raise for the given arguments, mentioning ``exc_msg``."""
    resolver = Resolver()
    with pytest.raises(Exception) as exc:
        resolver.download_to_directory(url, basename)

    # assert exception message contained
    # NOTE(review): this stringifies pytest's ExceptionInfo wrapper, not the
    # exception itself; consider ``str(exc.value)`` once the expected
    # ``exc_msg`` values are confirmed to match the plain message.
    raised_text = str(exc)
    assert exc_msg in raised_text
def test_workspace_init_missing_mets():
    """Raise Exception when missing mets-file in workspace"""
    resolver = Resolver()
    with pytest.raises(Exception) as exc:
        Workspace(resolver, "foo/bar")

    message = str(exc.value)
    assert "File does not exist" in message