Python Documentの例、ocrd_browser.model.Document Pythonの例

コード例 #1

0

ファイルを表示

    def __init__(self, document: Document):
        """
        Set up the underlying ListStore, add one row per page of the document and schedule the lazy loading

        Every row starts with an empty tooltip and no thumbnail, the actual image and data loading
        happens in _load_row
        """
        # Column id -> column type, the values are handed to the parent constructor in this order
        column_types = {
            self.COLUMN_PAGE_ID: str,
            self.COLUMN_TOOLTIP: str,
            self.COLUMN_FILENAME: str,
            self.COLUMN_THUMB: GdkPixbuf.Pixbuf,
            self.COLUMN_ORDER: int
            # self.COLUMN_HASH: str file hash = filename + modified_time (gets added by LazyLoadingListStore)
        }
        super().__init__(
            *column_types.values(),
            init_row=self._init_row,
            load_row=self._load_row,
            hash_row=self._hash_row
        )
        self.document = document

        # Placeholder icons, keyed by icon name
        icons = {}
        for icon_name in ['page-loading', 'page-missing']:
            resource_path = '/org/readmachine/ocrd-browser/icons/{}.png'.format(icon_name)
            icons[icon_name] = GdkPixbuf.Pixbuf.new_from_resource(resource_path)
        self.pixbufs: Dict[str, GdkPixbuf.Pixbuf] = icons

        # TODO: make file_group selectable, see https://github.com/hnesk/browse-ocrd/issues/7#issuecomment-707851109
        self.file_group = document.get_default_image_group(SETTINGS.file_groups.preferred_images)
        image_paths = document.get_image_paths(self.file_group)
        # The order column is 1-based
        for position, page_id in enumerate(self.document.page_ids, start=1):
            image_path = image_paths[page_id]
            filename = str(image_path) if image_path else None
            self.append((page_id, '', filename, None, position))

        GLib.timeout_add(10, self.start_loading)

コード例 #2

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_mime_types(self):
     """Document.mime_types must equal the set of the three expected MIME types."""
     document = Document.load(self.path)
     expected_mime_types = {
         'application/alto+xml',
         'application/vnd.prima.page+xml',
         'image/tiff',
     }
     self.assertEqual(expected_mime_types, document.mime_types)

コード例 #3

0

ファイルを表示

    def setUp(self):
        """Prepare a mocked MainWindow holding a Document from Document.create and a ViewManager on a Gtk.Box."""
        container = Gtk.Box(name='container')
        window = MagicMock(spec=MainWindow)
        window.document = Document.create(window)
        # No page is selected initially
        window.current_page_id = None

        self.root = container
        self.win = window
        self.vm = ViewManager(window, container)

コード例 #4

0

ファイルを表示

 def test_path_with_spaces(self):
     """A workspace whose directory name contains a space can be loaded by URI and yields a page image."""
     mets_path = TEST_BASE_PATH / 'example/workspaces/heavy quoting/mets.xml'
     document = Document.load(mets_path.as_uri())
     page = document.page_for_id('PHYS_0017', 'OCR-D-GT-PAGE')
     result = document.workspace.image_from_page(page.page, 'PHYS_0017')
     # Assert no exceptions happened and a sensible return value
     page_image = result[0]
     self.assertGreater(page_image.height, 100)

コード例 #5

0

ファイルを表示

 def test_missing_image(self):
     """get_image on a page of the 'missing_image' workspace must return None as image instead of raising."""
     mets_uri = (TEST_BASE_PATH / 'example/workspaces/kant_aufklaerung_1784_missing_image/mets.xml').as_uri()
     document = Document.load(mets_uri)
     page = document.page_for_id('PHYS_0017', 'OCR-D-GT-PAGE')
     image, _info, _exif = page.get_image(feature_selector='', feature_filter='binarized')
     # Assert no exceptions happened and no image returned
     self.assertIsNone(image)

コード例 #6

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

    def test_save(self):
        """
        A cloned document saved with save_as and loaded again must match the original

        Compared are file groups, page ids, the METS unique identifier and, for each page and each
        (file group, mimetype) pair, the first file returned by files_for_page_id
        """
        original = Document.clone(self.path)
        with TemporaryDirectory(prefix='browse-ocrd-tests') as target_dir:
            mets_path = '/'.join((target_dir, 'mets.xml'))
            original.save_as(mets_path)
            reloaded = Document.load(mets_path)

            self.assertEqual(original.file_groups, reloaded.file_groups)
            self.assertEqual(original.page_ids, reloaded.page_ids)
            self.assertEqual(
                original.workspace.mets.unique_identifier,
                reloaded.workspace.mets.unique_identifier
            )

            for page_id in original.page_ids:
                for file_group, mime in original.file_groups_and_mimetypes:
                    lookup = (page_id, file_group, mime)
                    file_before = original.files_for_page_id(*lookup)[0]
                    file_after = reloaded.files_for_page_id(*lookup)[0]
                    self.assertEqual(file_before, file_after)

コード例 #7

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_path_ocrd_file(self):
     """Document.path() given a file found in the METS must return the expected path below ASSETS_PATH."""
     document = Document.load(self.path)
     matches = list(document.workspace.mets.find_files(pageId='PHYS_0017', fileGrp='OCR-D-IMG'))
     image_file = matches[0]
     expected_path = ASSETS_PATH / 'kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'
     self.assertEqual(expected_path, document.path(image_file))

コード例 #8

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

    def test_clone(self):
        """
        Document.clone must create a workspace in a 'browse-ocrd-clone-' directory with identical file contents

        baseurl_mets of the clone must still be the original path
        """
        cloned_doc = Document.clone(self.path)
        self.assertIn('browse-ocrd-clone-', cloned_doc.workspace.directory)
        self.assertEqual(str(self.path), cloned_doc.baseurl_mets)

        originals = sorted(self.path.parent.rglob('*.*'))
        clones = sorted(Path(cloned_doc.workspace.directory).rglob('*.*'))
        # NOTE(review): zip stops at the shorter sequence, so the number of files is not compared here
        for original_file, cloned_file in zip(originals, clones):
            self.assertEqual(original_file.read_bytes(), cloned_file.read_bytes())

コード例 #9

0

ファイルを表示

ファイル: window.py プロジェクト: bertsky/browse-ocrd

    def _open(self, uri: str) -> None:
        """
        Load the document at uri and hand it to the page list and the view manager

        After the UI has been updated, the first page gets activated if the document has any pages

        :param uri: location that is passed on to Document.load
        """
        # noinspection PyTypeChecker
        self.document = Document.load(uri, emitter=self.emit)
        self.page_list.set_document(self.document)

        self.view_manager.set_document(self.document)
        self.update_ui()

        # Read page_ids once and rely on truthiness: nothing to activate for a document without pages
        page_ids = self.document.page_ids
        if page_ids:
            self.on_page_activated(None, page_ids[0])

コード例 #10

0

ファイルを表示

 def test_page_for_id_with_multiple_images_for_page_and_fileGrp(self):
     """
     page_for_id should return the first image and warn

     Only the returned Page and its pc_gts are checked here, the warning itself is not asserted
     """
     mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
     document = Document.load(mets_path)
     # NOTE: the assertLogs('ocrd_browser.model.document', level='WARNING') check is disabled in this variant
     result = document.page_for_id('PHYS_0017', 'OCR-D-IMG-CLIP')
     self.assertIsInstance(result, Page)
     self.assertIsInstance(result.pc_gts, PcGtsType)

コード例 #11

0

ファイルを表示

ファイル: test_page.py プロジェクト: bertsky/browse-ocrd

 def test_can_call_get_image_if_supported(self):
     """
     get_image(filename=...) must work when IMAGE_FROM_PAGE_FILENAME_SUPPORT is set and raise otherwise

     With support, selecting the image by feature and by filename must give equal results.
     Without support, a RuntimeError starting with 'Parameter filename not supported in ' is expected.
     """
     document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/mets.xml')
     page: LazyPage = document.page_for_id('P_0017', 'OCR-D-GT-WORD')
     if IMAGE_FROM_PAGE_FILENAME_SUPPORT:
         image_by_feature, _, _ = page.get_image(feature_selector={'binarized'}, feature_filter={'cropped'})
         image_by_filename, _, _ = page.get_image(filename='OCR-D-IMG-BIN/BIN_0017.png', feature_filter={'cropped'})
         self.assertEqual(image_by_feature, image_by_filename)
     else:
         # assertRaises replaces the manual try/fail/except, msg keeps the original failure hint
         with self.assertRaises(RuntimeError, msg='IMAGE_FROM_PAGE_FILENAME_SUPPORT detected wrong') as raised:
             page.get_image(filename='OCR-D-GT-IMG-BIN/PAGE_2019.tif')
         self.assertTrue(str(raised.exception).startswith('Parameter filename not supported in '))

コード例 #12

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_page_index(self):
     """The file index must contain 3 files with static_page_id 'PHYS_0017' and 2 files of mimetype ALTO."""
     document = Document.load(self.path)
     indexed_files = document.get_file_index().values()

     files_on_page_17 = []
     alto_files = []
     for indexed_file in indexed_files:
         if indexed_file.static_page_id == 'PHYS_0017':
             files_on_page_17.append(indexed_file)
     for indexed_file in indexed_files:
         if indexed_file.mimetype == 'application/alto+xml':
             alto_files.append(indexed_file)

     self.assertEqual(3, len(files_on_page_17))
     self.assertEqual(2, len(alto_files))

コード例 #13

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_page_for_id_with_multiple_images_for_page_and_fileGrp(self):
     """
     page_for_id must return a Page and log exactly one warning about the two images
     """
     expected_warning = "No PAGE-XML but 2 images for page 'PHYS_0017' in fileGrp 'OCR-D-IMG-CLIP'"
     document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')
     with self.assertLogs('ocrd_browser.model.document', level='WARNING') as captured:
         result = document.page_for_id('PHYS_0017', 'OCR-D-IMG-CLIP')
     self.assertIsInstance(result, Page)
     self.assertEqual(1, len(captured.records))
     self.assertEqual(expected_warning, captured.records[0].msg)

コード例 #14

0

ファイルを表示

ファイル: window.py プロジェクト: bertsky/browse-ocrd

    def __init__(self, **kwargs: Any):
        """
        Build the main window

        Sets the window icon, creates the view manager and a document via Document.create,
        registers the window actions, adds the page preview list and one menu button per view option,
        then installs ViewPage as root view and updates the UI

        :param kwargs: passed through to Gtk.ApplicationWindow.__init__
        """
        Gtk.ApplicationWindow.__init__(self, **kwargs)
        # noinspection PyCallByClass,PyArgumentList
        window_icon = GdkPixbuf.Pixbuf.new_from_resource("/org/readmachine/ocrd-browser/icons/icon.png")
        self.set_icon(window_icon)
        self.view_manager = ViewManager(self, self.view_container)
        self.current_page_id: Optional[str] = None
        # noinspection PyTypeChecker
        self.document = Document.create(emitter=self.emit)

        self.actions = ActionRegistry(for_widget=self)
        create_action = self.actions.create
        # Actions are registered in the same order as before, parameterless ones in bulk
        for action_name in ('close', 'goto_first', 'go_back', 'go_forward', 'goto_last',
                            'page_remove', 'page_properties'):
            create_action(action_name)
        create_action('close_view', param_type=GLib.VariantType("s"))
        create_action('split_view', param_type=GLib.VariantType("(ssb)"))
        create_action('create_view', param_type=GLib.VariantType("s"))
        create_action('replace_view', param_type=GLib.VariantType("(ss)"))
        create_action('toggle_edit_mode', state=GLib.Variant('b', False))
        for action_name in ('save', 'save_as'):
            create_action(action_name)

        self.connect('delete-event', self.on_delete_event)

        self.page_list = PagePreviewList(self.document)
        self.page_list_scroller.add(self.page_list)
        self.page_list.connect('page_activated', self.on_page_activated)
        self.page_list.connect('pages_selected', self.on_pages_selected)

        # One menu button per registered view option, each triggering win.create_view with the view id
        view_options = self.view_registry.get_view_options()
        for view_id, view_label in view_options.items():
            button = Gtk.ModelButton(
                visible=True,
                centered=False,
                halign=Gtk.Align.FILL,
                label=view_label,
                hexpand=True
            )
            button.set_detailed_action_name('win.create_view("{}")'.format(view_id))
            self.view_menu_box.pack_start(button, True, True, 0)

        self.view_manager.set_root_view(ViewPage)

        self.update_ui()

コード例 #15

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

    def test_page_for_id_with_no_images_for_page_and_fileGrp(self):
        """
        Issue #4: list index out of range on non-XML fileGrp

        page_for_id must return None and log exactly one warning

        https://github.com/hnesk/browse-ocrd/issues/4
        """
        expected_warning = "No PAGE-XML and no image for page 'PHYS_0020' in fileGrp 'OCR-D-IMG-CLIP'"
        document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')
        with self.assertLogs('ocrd_browser.model.document', level='WARNING') as captured:
            result = document.page_for_id('PHYS_0020', 'OCR-D-IMG-CLIP')
        self.assertIsNone(result)
        self.assertEqual(1, len(captured.records))
        self.assertEqual(expected_warning, captured.records[0].msg)

コード例 #16

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

    def test_page_for_id_with_nothing_for_page_and_fileGrp(self):
        """
        Issue #4 again: This time for missing PAGE-XMLs

        page_for_id must return None and log exactly one warning

        https://github.com/hnesk/browse-ocrd/issues/4
        """
        expected_warning = "No PAGE-XML and no image for page 'PHYS_0020' in fileGrp 'OCR-D-GT-PAGE'"
        mets_path = ASSETS_PATH / '../example/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml'
        document = Document.load(mets_path)
        with self.assertLogs('ocrd_browser.model.document', level='WARNING') as captured:
            result = document.page_for_id('PHYS_0020', 'OCR-D-GT-PAGE')
        self.assertIsNone(result)
        self.assertEqual(1, len(captured.records))
        self.assertEqual(expected_warning, captured.records[0].msg)

コード例 #17

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_path_path(self):
     """Document.path() given a relative Path must return the expected path below ASSETS_PATH."""
     document = Document.load(self.path)
     expected_path = ASSETS_PATH / 'kant_aufklaerung_1784/data/OCR-D-DIR/lala.xml'
     relative_path = Path('OCR-D-DIR/lala.xml')
     self.assertEqual(expected_path, document.path(relative_path))

コード例 #18

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_path_string(self):
     """Document.path() given a plain string must return the expected path below ASSETS_PATH."""
     document = Document.load(self.path)
     expected_path = ASSETS_PATH / 'kant_aufklaerung_1784/data/lala.xml'
     self.assertEqual(expected_path, document.path('lala.xml'))

コード例 #19

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_default_image_group_with_missing_ocr_d_img(self):
     """For the 'no_ocrd_d_img_group' workspace the default image group must be 'OCR-D-IMG-PNG'."""
     mets_path = ASSETS_PATH / '../example/workspaces/no_ocrd_d_img_group/mets.xml'
     document = Document.load(mets_path)
     default_group = document.get_default_image_group()
     self.assertEqual('OCR-D-IMG-PNG', default_group)

コード例 #20

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_default_image_group_no_preference(self):
     """Called without arguments, get_default_image_group must return 'OCR-D-IMG' for the complex workspace."""
     mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
     document = Document.load(mets_path)
     default_group = document.get_default_image_group()
     self.assertEqual('OCR-D-IMG', default_group)

コード例 #21

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_default_image_group(self):
     """With a preference list starting with 'OCR-D-IMG-BIN', that group must be returned."""
     preferred_groups = ['OCR-D-IMG-BIN', 'OCR-D-IMG.*']
     mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
     document = Document.load(mets_path)
     default_group = document.get_default_image_group(preferred_groups)
     self.assertEqual('OCR-D-IMG-BIN', default_group)

コード例 #22

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_delete(self):
     """After delete_page('PHYS_0017') on a cloned document only 'PHYS_0020' must remain in page_ids."""
     cloned_doc = Document.clone(self.path)
     cloned_doc.delete_page('PHYS_0017')
     remaining_page_ids = ['PHYS_0020']
     self.assertEqual(remaining_page_ids, cloned_doc.page_ids)

コード例 #23

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_file_groups_and_mimetypes(self):
     """file_groups_and_mimetypes must equal the expected list of (file group, mimetype) pairs, in order."""
     document = Document.load(self.path)
     expected_pairs = [
         ('OCR-D-IMG', 'image/tiff'),
         ('OCR-D-GT-PAGE', 'application/vnd.prima.page+xml'),
         ('OCR-D-GT-ALTO', 'application/alto+xml'),
     ]
     self.assertEqual(expected_pairs, document.file_groups_and_mimetypes)

コード例 #24

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_modify_when_editable(self):
     """reorder on a document obtained via Document.clone must not raise (no further assertions)."""
     new_order = ['PHYS_0020', 'PHYS_0017']
     cloned_doc = Document.clone(self.path)
     cloned_doc.reorder(new_order)

コード例 #25

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_modify_when_not_editable(self):
     """reorder on a document obtained via Document.load must raise PermissionError."""
     new_order = ['PHYS_0020', 'PHYS_0017']
     loaded_doc = Document.load(self.path)
     with self.assertRaises(PermissionError):
         loaded_doc.reorder(new_order)

コード例 #26

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_page_ids(self):
     """page_ids of the loaded document must be ['PHYS_0017', 'PHYS_0020']."""
     expected_page_ids = ['PHYS_0017', 'PHYS_0020']
     document = Document.load(self.path)
     self.assertEqual(expected_page_ids, document.page_ids)

コード例 #27

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_derive_backup_directory(self):
     """_derive_backup_directory must return the expected '.bak.<name>.<timestamp>' sibling path for the sample input."""
     workspace_directory = Path('/home/jk/important_project')
     timestamp = datetime(2020, 8, 13, 18, 43, 21)
     expected_backup = Path('/home/jk/.bak.important_project.20200813-184321')
     actual_backup = Document._derive_backup_directory(workspace_directory, timestamp)
     self.assertEqual(expected_backup, actual_backup)

コード例 #28

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_reorder(self):
     """After reorder on a cloned document, page_ids must be in the requested order."""
     cloned_doc = Document.clone(self.path)
     cloned_doc.reorder(['PHYS_0020', 'PHYS_0017'])
     expected_page_ids = ['PHYS_0020', 'PHYS_0017']
     self.assertEqual(expected_page_ids, cloned_doc.page_ids)

コード例 #29

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

    def test_reorder_with_wrong_ids_raises_value_error(self):
        """reorder with an id list containing 'PHYS_0021' must raise a ValueError mentioning 'page_ids do not match'."""
        mismatching_ids = ['PHYS_0021', 'PHYS_0017']
        cloned_doc = Document.clone(self.path)
        with self.assertRaises(ValueError) as raised:
            cloned_doc.reorder(mismatching_ids)

        error_text = str(raised.exception)
        self.assertIn('page_ids do not match', error_text)

コード例 #30

0

ファイルを表示

ファイル: test_document.py プロジェクト: hnesk/browse-ocrd

 def test_get_image_paths(self):
     """get_image_paths('OCR-D-IMG') must return two entries whose file names match the two pages."""
     document = Document.load(self.path)
     paths_by_page = document.get_image_paths('OCR-D-IMG')
     self.assertEqual(2, len(paths_by_page))
     first_name = paths_by_page['PHYS_0017'].name
     self.assertEqual('INPUT_0017.tif', first_name)
     second_name = paths_by_page['PHYS_0020'].name
     self.assertEqual('INPUT_0020.tif', second_name)