def runTest(self):
    """Chain region, line and word segmentation, then save the METS once.

    The workspace is created in ``WORKSPACE_DIR`` from the
    ``kant_aufklaerung_1784-binarized`` fixture. Each processor reads the
    file group written by the previous one.
    """
    resolver = Resolver()
    # Alternative fixture kept for reference:
    #   assets.url_of('SBB0000F29300010000/mets_one_file.xml')
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/mets.xml')
    workspace = resolver.workspace_from_url(mets_url, directory=WORKSPACE_DIR)

    region_step = TesserocrSegmentRegion(
        workspace,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
    )
    region_step.process()

    line_step = TesserocrSegmentLine(
        workspace,
        input_file_grp="OCR-D-SEG-BLOCK",
        output_file_grp="OCR-D-SEG-LINE",
    )
    line_step.process()

    word_step = TesserocrSegmentWord(
        workspace,
        input_file_grp="OCR-D-SEG-LINE",
        output_file_grp="OCR-D-SEG-WORD",
    )
    word_step.process()

    # The METS file is written only once, after all three steps have run.
    workspace.save_mets()
def runTest(self):
    """Segment regions and lines, then recognize text at word level.

    The workspace is created in ``WORKSPACE_DIR`` from the
    ``kant_aufklaerung_1784-page-block-line-word`` fixture, and the METS
    file is saved after every processing step.
    """
    resolver = Resolver(cache_enabled=True)
    # Alternative fixture kept for reference:
    #   assets.url_of('SBB0000F29300010000/mets_one_file.xml')
    mets_url = assets.url_of(
        'kant_aufklaerung_1784-page-block-line-word/mets.xml')
    workspace = resolver.workspace_from_url(mets_url, directory=WORKSPACE_DIR)

    region_step = TesserocrSegmentRegion(
        workspace,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
    )
    region_step.process()
    workspace.save_mets()

    line_step = TesserocrSegmentLine(
        workspace,
        input_file_grp="OCR-D-SEG-BLOCK",
        output_file_grp="OCR-D-SEG-LINE",
    )
    line_step.process()
    workspace.save_mets()

    # Recognition runs on the line segmentation output with
    # textequiv_level set to 'word'.
    recognize_params = {'textequiv_level': 'word'}
    recognize_step = TesserocrRecognize(
        workspace,
        input_file_grp="OCR-D-SEG-LINE",
        output_file_grp="OCR-D-OCR-TESS",
        parameter=recognize_params,
    )
    recognize_step.process()
    workspace.save_mets()
def runTest(self):
    """Run segmentation and two recognition passes, saving after each step.

    Order of steps: region segmentation, line segmentation, recognition
    with ``textequiv_level='line'``, word segmentation, and recognition
    with ``textequiv_level='glyph'`` on the word segmentation output.
    """
    resolver = Resolver()
    # Alternative fixture kept for reference:
    #   assets.url_of('SBB0000F29300010000/mets_one_file.xml')
    mets_url = assets.url_of(
        'kant_aufklaerung_1784-page-block-line-word/mets.xml')
    workspace = resolver.workspace_from_url(mets_url, directory=WORKSPACE_DIR)

    region_step = TesserocrSegmentRegion(
        workspace,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK",
    )
    region_step.process()
    workspace.save_mets()

    line_step = TesserocrSegmentLine(
        workspace,
        input_file_grp="OCR-D-SEG-BLOCK",
        output_file_grp="OCR-D-SEG-LINE",
    )
    line_step.process()
    workspace.save_mets()

    # Adding 'model': 'Fraktur' here would require the
    # tesseract-ocr-script-frak dependency.
    line_level_params = {'textequiv_level': 'line'}
    line_recognizer = TesserocrRecognize(
        workspace,
        input_file_grp="OCR-D-SEG-LINE",
        output_file_grp="OCR-D-OCR-TESS",
        parameter=line_level_params,
    )
    line_recognizer.process()
    workspace.save_mets()

    word_step = TesserocrSegmentWord(
        workspace,
        input_file_grp="OCR-D-SEG-LINE",
        output_file_grp="OCR-D-SEG-WORD",
    )
    word_step.process()
    workspace.save_mets()

    # Adding 'model': 'Fraktur' here would require the
    # tesseract-ocr-script-frak dependency.
    glyph_level_params = {'textequiv_level': 'glyph'}
    glyph_recognizer = TesserocrRecognize(
        workspace,
        input_file_grp="OCR-D-SEG-WORD",
        output_file_grp="OCR-D-OCR-TESS-W2C",
        parameter=glyph_level_params,
    )
    glyph_recognizer.process()
    workspace.save_mets()
import os
import shutil

from test.base import TestCase, main, assets
from ocrd.resolver import Resolver
from ocrd_tesserocr import TesserocrSegmentRegion

# METS fixture used as the workspace source.
METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')

# Directory the workspace is created in; wiped before every test.
WORKSPACE_DIR = '/tmp/pyocrd-test-segment-region-tesserocr'


class TestTesserocrSegmentRegionTesseract(TestCase):
    """Run ``TesserocrSegmentRegion`` on a workspace built from a METS fixture."""

    def setUp(self):
        """Remove ``WORKSPACE_DIR`` if it exists and recreate it empty."""
        if os.path.exists(WORKSPACE_DIR):
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    def runTest(self):
        """Segment regions from OCR-D-IMG into OCR-D-SEG-BLOCK and save the METS."""
        resolver = Resolver()
        workspace = resolver.workspace_from_url(
            METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
        region_step = TesserocrSegmentRegion(
            workspace,
            input_file_grp="OCR-D-IMG",
            output_file_grp="OCR-D-SEG-BLOCK",
        )
        region_step.process()
        workspace.save_mets()


if __name__ == '__main__':
    main()
# pylint: disable=import-error import os import shutil from tempfile import TemporaryDirectory from test.base import TestCase, assets, main from ocrd.resolver import Resolver from ocrd_ocropy.segment import OcropySegment PARAM_JSON = assets.url_of('param-segment.json') WORKSPACE_DIR = '/tmp/ocrd-ocropy-segment-test' class TestOcropySegment(TestCase): def setUp(self): if os.path.exists(WORKSPACE_DIR): shutil.rmtree(WORKSPACE_DIR) os.makedirs(WORKSPACE_DIR) def test_run1(self): resolver = Resolver() with TemporaryDirectory() as tempdir: workspace = resolver.workspace_from_url(assets.path_to( 'kant_aufklaerung_1784-binarized/data/mets.xml'), dst_dir=tempdir) proc = OcropySegment( workspace, input_file_grp="OCR-D-IMG-BIN",
def setUp(self):
    """Build ``self.mets`` from the SBB0000F29300010000 METS fixture."""
    mets_url = assets.url_of('SBB0000F29300010000/mets.xml')
    self.mets = OcrdMets(filename=mets_url)
def setUp(self):
    """Create ``self.resolver`` and open ``self.workspace`` from the METS fixture."""
    self.resolver = Resolver()
    mets_url = assets.url_of('SBB0000F29300010000/mets.xml')
    self.workspace = self.resolver.workspace_from_url(mets_url)
import os
import shutil

from test.base import TestCase, main, assets, skip
from ocrd.resolver import Resolver
from ocrd_tesserocr.segment_word import TesserocrSegmentWord
from ocrd_tesserocr.segment_line import TesserocrSegmentLine
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
from ocrd_tesserocr.recognize import TesserocrRecognize

# Previously used fixture, kept for reference:
#   assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784/data/mets.xml')

# Directory the workspace is created in; wiped before every test.
WORKSPACE_DIR = '/tmp/pyocrd-test-recognizer'


class TestTesserocrRecognize(TestCase):
    """Test case that builds a workspace from ``METS_HEROLD_SMALL``."""

    def setUp(self):
        """Remove ``WORKSPACE_DIR`` if it exists and recreate it empty."""
        if os.path.exists(WORKSPACE_DIR):
            shutil.rmtree(WORKSPACE_DIR)
        os.makedirs(WORKSPACE_DIR)

    # skip("Takes too long")
    def runTest(self):
        """Create the workspace and segment regions into OCR-D-SEG-BLOCK.

        NOTE(review): only the region segmentation step is visible here,
        although the class name and imports refer to recognition.
        """
        resolver = Resolver()
        workspace = resolver.workspace_from_url(
            METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
        TesserocrSegmentRegion(
            workspace,
            input_file_grp="OCR-D-IMG",
            output_file_grp="OCR-D-SEG-BLOCK",
        ).process()
def runTest(self):
    """Validate the one-file METS fixture and print the report as XML."""
    mets_url = assets.url_of('SBB0000F29300010000/mets_one_file.xml')
    report = WorkspaceValidator.validate_url(self.resolver, mets_url)
    report_xml = report.to_xml()
    print(report_xml)
def runTest(self):
    """Run ``TesserocrSegmentRegion`` with its default file groups and save the METS."""
    resolver = Resolver(cache_enabled=True)
    mets_url = assets.url_of('SBB0000F29300010000/mets_one_file.xml')
    workspace = resolver.workspace_from_url(mets_url)

    # No input/output file groups are passed, so the processor's own
    # defaults apply.
    region_step = TesserocrSegmentRegion(workspace)
    region_step.process()
    workspace.save_mets()
import os from shutil import copytree, rmtree from ocrd.model import OcrdExif from ocrd.resolver import Resolver from test.base import TestCase, assets, main TMP_FOLDER = '/tmp/test-pyocrd-resolver' METS_HEROLD = assets.url_of('SBB0000F29300010000/mets.xml') FOLDER_KANT = assets.url_of('kant_aufklaerung_1784')[len('file://'):] TEST_ZIP = assets.url_of('test.ocrd.zip')[len('file://'):] class TestResolver(TestCase): def setUp(self): self.resolver = Resolver(cache_enabled=True) self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784') if os.path.exists(TMP_FOLDER): rmtree(TMP_FOLDER) os.makedirs(TMP_FOLDER) copytree(FOLDER_KANT, self.folder) def test_unpack_workspace(self): workspace = self.resolver.unpack_workspace_from_filename(TEST_ZIP) files = workspace.mets.find_files(mimetype='image/tiff') self.assertEqual(len(files), 2, '2 TIF') for f in files: workspace.download_file(f) print([OcrdExif.from_filename(f.local_filename).to_xml() for f in files]) def test_workspace_from_folder(self):
import os
from shutil import copytree, rmtree

from test.base import TestCase, assets, main
from ocrd.model import OcrdExif
from ocrd.resolver import Resolver

# Scratch directory recreated before every test.
TMP_FOLDER = '/tmp/test-pyocrd-resolver'

# Fixtures resolved through the shared test assets helper.
METS_HEROLD = assets.url_of('SBB0000F29300010000/mets.xml')
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')
TEST_ZIP = assets.path_to('test.ocrd.zip')


class TestResolver(TestCase):
    """Tests for ``Resolver`` using the bundled METS and folder fixtures."""

    def setUp(self):
        """Create the resolver and a fresh copy of the Kant fixture folder.

        ``TMP_FOLDER`` is removed if it exists and recreated, then
        ``FOLDER_KANT`` is copied to ``self.folder`` inside it.
        """
        self.resolver = Resolver()
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        if os.path.exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url(self):
        """Open a workspace from ``METS_HEROLD`` and download its first OCR-D-IMG file."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        # Only the first file of the group is downloaded.
        image_file = input_files[0]
        f = workspace.download_file(image_file)