def __init__(self, document: Document):
    """
    Set up the underlying ListStore with one placeholder row per page and schedule lazy loading.

    Each row starts with an empty tooltip and no thumbnail; the callbacks handed
    to the parent class (``_init_row``, ``_load_row``, ``_hash_row``) do the
    actual image and data loading.

    :param document: the document whose pages are listed
    """
    # Maps each COLUMN_* constant to its column type; only the types are passed on.
    # self.COLUMN_HASH (str, file hash = filename + modified_time) is missing here
    # on purpose: it gets added by LazyLoadingListStore.
    column_types = {
        self.COLUMN_PAGE_ID: str,
        self.COLUMN_TOOLTIP: str,
        self.COLUMN_FILENAME: str,
        self.COLUMN_THUMB: GdkPixbuf.Pixbuf,
        self.COLUMN_ORDER: int,
    }
    super().__init__(
        *column_types.values(),
        init_row=self._init_row,
        load_row=self._load_row,
        hash_row=self._hash_row
    )
    self.document = document

    # Status icons, keyed by icon name.
    status_icons: Dict[str, GdkPixbuf.Pixbuf] = {}
    for icon_name in ['page-loading', 'page-missing']:
        resource_path = '/org/readmachine/ocrd-browser/icons/{}.png'.format(icon_name)
        status_icons[icon_name] = GdkPixbuf.Pixbuf.new_from_resource(resource_path)
    self.pixbufs: Dict[str, GdkPixbuf.Pixbuf] = status_icons

    # TODO: make file_group selectable, see https://github.com/hnesk/browse-ocrd/issues/7#issuecomment-707851109
    self.file_group = document.get_default_image_group(SETTINGS.file_groups.preferred_images)
    image_paths = document.get_image_paths(self.file_group)

    # The last column is the 1-based position of the page in the document.
    for position, page_id in enumerate(self.document.page_ids, start=1):
        image_path = image_paths[page_id]
        filename = str(image_path) if image_path else None
        self.append((page_id, '', filename, None, position))

    GLib.timeout_add(10, self.start_loading)
def test_get_mime_types(self):
    """The loaded document reports exactly the three expected mime types."""
    expected_mime_types = {
        'application/vnd.prima.page+xml',
        'image/tiff',
        'application/alto+xml',
    }
    document = Document.load(self.path)
    self.assertEqual(expected_mime_types, document.mime_types)
def setUp(self):
    """Create a ViewManager on top of a Gtk.Box root and a mocked MainWindow."""
    self.root = Gtk.Box(name='container')

    window = MagicMock(spec=MainWindow)
    window.document = Document.create(window)
    window.current_page_id = None
    self.win = window

    self.vm = ViewManager(self.win, self.root)
def test_path_with_spaces(self):
    """A workspace whose directory name contains a space can be loaded and yields a page image."""
    mets_uri = (TEST_BASE_PATH / 'example/workspaces/heavy quoting/mets.xml').as_uri()
    document = Document.load(mets_uri)
    page = document.page_for_id('PHYS_0017', 'OCR-D-GT-PAGE')
    result = document.workspace.image_from_page(page.page, 'PHYS_0017')
    # Assert no exceptions happened and a sensible return value
    page_image = result[0]
    self.assertGreater(page_image.height, 100)
def test_missing_image(self):
    """get_image on a workspace with a missing image file returns None as image instead of raising."""
    mets_path = TEST_BASE_PATH / 'example/workspaces/kant_aufklaerung_1784_missing_image/mets.xml'
    document = Document.load(mets_path.as_uri())
    page = document.page_for_id('PHYS_0017', 'OCR-D-GT-PAGE')
    image, _, _ = page.get_image(feature_selector='', feature_filter='binarized')
    # Assert no exceptions happened and no image returned
    self.assertIsNone(image)
def test_save(self):
    """A cloned document saved via save_as and loaded again matches the clone."""
    source = Document.clone(self.path)
    with TemporaryDirectory(prefix='browse-ocrd-tests') as directory:
        mets_location = '{}/mets.xml'.format(directory)
        source.save_as(mets_location)
        reloaded = Document.load(mets_location)

        # Document-level properties must survive the round trip.
        self.assertEqual(source.file_groups, reloaded.file_groups)
        self.assertEqual(source.page_ids, reloaded.page_ids)
        self.assertEqual(
            source.workspace.mets.unique_identifier,
            reloaded.workspace.mets.unique_identifier
        )

        # Compare the first file of every page / fileGrp / mimetype combination.
        for page_id in source.page_ids:
            for group, mimetype in source.file_groups_and_mimetypes:
                expected_file = source.files_for_page_id(page_id, group, mimetype)[0]
                actual_file = reloaded.files_for_page_id(page_id, group, mimetype)[0]
                self.assertEqual(expected_file, actual_file)
def test_path_ocrd_file(self):
    """Document.path resolves a file found in the METS to its path below the assets directory."""
    document = Document.load(self.path)
    matches = list(document.workspace.mets.find_files(pageId='PHYS_0017', fileGrp='OCR-D-IMG'))
    first_match = matches[0]
    expected_path = ASSETS_PATH / 'kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif'
    self.assertEqual(expected_path, document.path(first_match))
def test_clone(self):
    """Cloning copies the workspace into a 'browse-ocrd-clone-' directory and keeps the original METS url."""
    clone = Document.clone(self.path)
    self.assertIn('browse-ocrd-clone-', clone.workspace.directory)
    self.assertEqual(str(self.path), clone.baseurl_mets)

    source_files = sorted(self.path.parent.rglob('*.*'))
    clone_files = sorted(Path(clone.workspace.directory).rglob('*.*'))
    # NOTE(review): zip() stops at the shorter listing, so a differing number of
    # files is not detected by this loop - confirm whether that is intended.
    for source_file, clone_file in zip(source_files, clone_files):
        self.assertEqual(source_file.read_bytes(), clone_file.read_bytes())
def _open(self, uri: str) -> None:
    """
    Load the document at ``uri`` and show it in this window.

    The new document is handed to the page list and the view manager, the UI
    is refreshed, and - if the document has any pages - the first page is
    activated.

    :param uri: location of the document, passed unchanged to ``Document.load``
    """
    # noinspection PyTypeChecker
    self.document = Document.load(uri, emitter=self.emit)
    self.page_list.set_document(self.document)
    self.view_manager.set_document(self.document)
    self.update_ui()

    # Read the property once and rely on sequence truthiness instead of len().
    page_ids = self.document.page_ids
    if page_ids:
        self.on_page_activated(None, page_ids[0])
def test_page_for_id_with_multiple_images_for_page_and_fileGrp(self):
    """
    page_for_id on 'PHYS_0017' / 'OCR-D-IMG-CLIP' yields a Page carrying a PcGtsType
    """
    mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
    document = Document.load(mets_path)
    # NOTE(review): the assertLogs('ocrd_browser.model.document', level='WARNING')
    # check is disabled in this variant; only the returned page is verified.
    result = document.page_for_id('PHYS_0017', 'OCR-D-IMG-CLIP')
    self.assertIsInstance(result, Page)
    self.assertIsInstance(result.pc_gts, PcGtsType)
def test_can_call_get_image_if_supported(self):
    """
    get_image(filename=...) works when IMAGE_FROM_PAGE_FILENAME_SUPPORT is set and raises RuntimeError otherwise
    """
    document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-binarized/data/mets.xml')
    page: LazyPage = document.page_for_id('P_0017', 'OCR-D-GT-WORD')

    if not IMAGE_FROM_PAGE_FILENAME_SUPPORT:
        # Without support, the filename parameter has to be rejected.
        try:
            page.get_image(filename='OCR-D-GT-IMG-BIN/PAGE_2019.tif')
        except RuntimeError as error:
            self.assertTrue(str(error).startswith('Parameter filename not supported in '))
        else:
            self.fail('IMAGE_FROM_PAGE_FILENAME_SUPPORT detected wrong')
        return

    # With support, selecting by feature and by filename must give equal images.
    by_feature, _, _ = page.get_image(feature_selector={'binarized'}, feature_filter={'cropped'})
    by_filename, _, _ = page.get_image(filename='OCR-D-IMG-BIN/BIN_0017.png', feature_filter={'cropped'})
    self.assertEqual(by_feature, by_filename)
def test_get_page_index(self):
    """The file index holds three files for page PHYS_0017 and two ALTO files."""
    document = Document.load(self.path)
    indexed_files = list(document.get_file_index().values())

    page17_count = sum(1 for entry in indexed_files if entry.static_page_id == 'PHYS_0017')
    alto_count = sum(1 for entry in indexed_files if entry.mimetype == 'application/alto+xml')

    self.assertEqual(3, page17_count)
    self.assertEqual(2, alto_count)
def test_page_for_id_with_multiple_images_for_page_and_fileGrp(self):
    """
    page_for_id still yields a Page and logs exactly one warning about 2 images without PAGE-XML
    """
    expected_message = "No PAGE-XML but 2 images for page 'PHYS_0017' in fileGrp 'OCR-D-IMG-CLIP'"
    document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')

    with self.assertLogs('ocrd_browser.model.document', level='WARNING') as captured:
        result = document.page_for_id('PHYS_0017', 'OCR-D-IMG-CLIP')

    self.assertIsInstance(result, Page)
    warnings = captured.records
    self.assertEqual(1, len(warnings))
    self.assertEqual(expected_message, warnings[0].msg)
def __init__(self, **kwargs: Any):
    """
    Build the main window: icon, view manager, initial document, actions, page list and view menu.

    :param kwargs: forwarded unchanged to ``Gtk.ApplicationWindow.__init__``
    """
    Gtk.ApplicationWindow.__init__(self, **kwargs)
    # noinspection PyCallByClass,PyArgumentList
    window_icon = GdkPixbuf.Pixbuf.new_from_resource("/org/readmachine/ocrd-browser/icons/icon.png")
    self.set_icon(window_icon)

    self.view_manager = ViewManager(self, self.view_container)
    self.current_page_id: Optional[str] = None
    # noinspection PyTypeChecker
    self.document = Document.create(emitter=self.emit)

    # Register the window actions in a fixed order: plain ones, parameterized
    # view actions, the stateful edit-mode toggle, then the save actions.
    self.actions = ActionRegistry(for_widget=self)
    plain_actions = (
        'close',
        'goto_first',
        'go_back',
        'go_forward',
        'goto_last',
        'page_remove',
        'page_properties',
    )
    for action_name in plain_actions:
        self.actions.create(action_name)

    view_actions = (
        ('close_view', "s"),
        ('split_view', "(ssb)"),
        ('create_view', "s"),
        ('replace_view', "(ss)"),
    )
    for action_name, type_string in view_actions:
        self.actions.create(action_name, param_type=GLib.VariantType(type_string))

    self.actions.create('toggle_edit_mode', state=GLib.Variant('b', False))

    for action_name in ('save', 'save_as'):
        self.actions.create(action_name)

    self.connect('delete-event', self.on_delete_event)

    # Page preview list inside its scroller, wired to the window's handlers.
    self.page_list = PagePreviewList(self.document)
    self.page_list_scroller.add(self.page_list)
    self.page_list.connect('page_activated', self.on_page_activated)
    self.page_list.connect('pages_selected', self.on_pages_selected)

    # One menu button per view option, each triggering win.create_view with the view id.
    for view_id, view_label in self.view_registry.get_view_options().items():
        button = Gtk.ModelButton(
            visible=True,
            centered=False,
            halign=Gtk.Align.FILL,
            label=view_label,
            hexpand=True
        )
        button.set_detailed_action_name(f'win.create_view("{view_id}")')
        self.view_menu_box.pack_start(button, True, True, 0)

    self.view_manager.set_root_view(ViewPage)
    # self.view_manager.split(None, ViewPage, False)

    self.update_ui()
def test_page_for_id_with_no_images_for_page_and_fileGrp(self):
    """
    Issue #4: list index out of range on non-XML fileGrp

    page_for_id has to return None and log a single warning.

    https://github.com/hnesk/browse-ocrd/issues/4
    """
    expected_message = "No PAGE-XML and no image for page 'PHYS_0020' in fileGrp 'OCR-D-IMG-CLIP'"
    document = Document.load(ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml')

    with self.assertLogs('ocrd_browser.model.document', level='WARNING') as captured:
        result = document.page_for_id('PHYS_0020', 'OCR-D-IMG-CLIP')

    self.assertIsNone(result)
    warnings = captured.records
    self.assertEqual(1, len(warnings))
    self.assertEqual(expected_message, warnings[0].msg)
def test_page_for_id_with_nothing_for_page_and_fileGrp(self):
    """
    Issue #4 again: This time for missing PAGE-XMLs

    page_for_id has to return None and log a single warning.

    https://github.com/hnesk/browse-ocrd/issues/4
    """
    expected_message = "No PAGE-XML and no image for page 'PHYS_0020' in fileGrp 'OCR-D-GT-PAGE'"
    mets_path = ASSETS_PATH / '../example/workspaces/kant_aufklaerung_1784_missing_xml/mets.xml'
    document = Document.load(mets_path)

    with self.assertLogs('ocrd_browser.model.document', level='WARNING') as captured:
        result = document.page_for_id('PHYS_0020', 'OCR-D-GT-PAGE')

    self.assertIsNone(result)
    warnings = captured.records
    self.assertEqual(1, len(warnings))
    self.assertEqual(expected_message, warnings[0].msg)
def test_path_path(self):
    """Document.path resolves a relative Path against the workspace data directory."""
    document = Document.load(self.path)
    expected_path = ASSETS_PATH / 'kant_aufklaerung_1784/data/OCR-D-DIR/lala.xml'
    relative_path = Path('OCR-D-DIR/lala.xml')
    self.assertEqual(expected_path, document.path(relative_path))
def test_path_string(self):
    """Document.path resolves a plain string file name against the workspace data directory."""
    document = Document.load(self.path)
    expected_path = ASSETS_PATH / 'kant_aufklaerung_1784/data/lala.xml'
    self.assertEqual(expected_path, document.path('lala.xml'))
def test_get_default_image_group_with_missing_ocr_d_img(self):
    """For the 'no_ocrd_d_img_group' workspace the default image group is 'OCR-D-IMG-PNG'."""
    mets_path = ASSETS_PATH / '../example/workspaces/no_ocrd_d_img_group/mets.xml'
    document = Document.load(mets_path)
    default_group = document.get_default_image_group()
    self.assertEqual('OCR-D-IMG-PNG', default_group)
def test_get_default_image_group_no_preference(self):
    """Called without preferences, get_default_image_group yields 'OCR-D-IMG' for the complex workspace."""
    mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
    document = Document.load(mets_path)
    default_group = document.get_default_image_group()
    self.assertEqual('OCR-D-IMG', default_group)
def test_get_default_image_group(self):
    """With the preference list ['OCR-D-IMG-BIN', 'OCR-D-IMG.*'] the group 'OCR-D-IMG-BIN' is chosen."""
    preferences = ['OCR-D-IMG-BIN', 'OCR-D-IMG.*']
    mets_path = ASSETS_PATH / 'kant_aufklaerung_1784-complex/data/mets.xml'
    document = Document.load(mets_path)
    default_group = document.get_default_image_group(preferences)
    self.assertEqual('OCR-D-IMG-BIN', default_group)
def test_delete(self):
    """Deleting page PHYS_0017 from a cloned document leaves only PHYS_0020."""
    clone = Document.clone(self.path)
    clone.delete_page('PHYS_0017')
    remaining_pages = clone.page_ids
    self.assertEqual(['PHYS_0020'], remaining_pages)
def test_get_file_groups_and_mimetypes(self):
    """file_groups_and_mimetypes lists the expected (fileGrp, mimetype) pairs in order."""
    document = Document.load(self.path)
    expected_pairs = [
        ('OCR-D-IMG', 'image/tiff'),
        ('OCR-D-GT-PAGE', 'application/vnd.prima.page+xml'),
        ('OCR-D-GT-ALTO', 'application/alto+xml'),
    ]
    self.assertEqual(expected_pairs, document.file_groups_and_mimetypes)
def test_modify_when_editable(self):
    """reorder() on a cloned document runs without raising; nothing else is asserted."""
    new_order = ['PHYS_0020', 'PHYS_0017']
    clone = Document.clone(self.path)
    clone.reorder(new_order)
def test_modify_when_not_editable(self):
    """reorder() on a document opened via Document.load must raise PermissionError."""
    new_order = ['PHYS_0020', 'PHYS_0017']
    document = Document.load(self.path)
    with self.assertRaises(PermissionError):
        document.reorder(new_order)
def test_get_page_ids(self):
    """page_ids of the loaded document are PHYS_0017 followed by PHYS_0020."""
    document = Document.load(self.path)
    expected_ids = ['PHYS_0017', 'PHYS_0020']
    self.assertEqual(expected_ids, document.page_ids)
def test_derive_backup_directory(self):
    """The backup directory is a hidden '.bak.<name>.<timestamp>' sibling of the workspace directory."""
    expected_backup = Path('/home/jk/.bak.important_project.20200813-184321')
    workspace_directory = Path('/home/jk/important_project')
    timestamp = datetime(2020, 8, 13, 18, 43, 21)
    actual_backup = Document._derive_backup_directory(workspace_directory, timestamp)
    self.assertEqual(expected_backup, actual_backup)
def test_reorder(self):
    """After reorder() on a cloned document, page_ids reflects the new order."""
    clone = Document.clone(self.path)
    clone.reorder(['PHYS_0020', 'PHYS_0017'])
    reordered_ids = clone.page_ids
    self.assertEqual(['PHYS_0020', 'PHYS_0017'], reordered_ids)
def test_reorder_with_wrong_ids_raises_value_error(self):
    """reorder() with an id that is not a page of the document raises ValueError('page_ids do not match...')."""
    mismatching_ids = ['PHYS_0021', 'PHYS_0017']
    clone = Document.clone(self.path)
    with self.assertRaises(ValueError) as raised:
        clone.reorder(mismatching_ids)
    error_text = str(raised.exception)
    self.assertIn('page_ids do not match', error_text)
def test_get_image_paths(self):
    """get_image_paths('OCR-D-IMG') maps both page ids to their INPUT_*.tif files."""
    document = Document.load(self.path)
    paths_by_page = document.get_image_paths('OCR-D-IMG')
    self.assertEqual(2, len(paths_by_page))
    page17_image = paths_by_page['PHYS_0017']
    self.assertEqual('INPUT_0017.tif', page17_image.name)
    page20_image = paths_by_page['PHYS_0020']
    self.assertEqual('INPUT_0020.tif', page20_image.name)