def test_should_provide_empty_list_for_empty_pages(self):
    """A document whose only page has no blocks yields no page numbers."""
    page_coordinates = LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    page_meta = LayoutPageMeta(page_number=1, coordinates=page_coordinates)
    empty_page = LayoutPage(blocks=[], meta=page_meta)
    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(
        LayoutDocument(pages=[empty_page])
    )
    assert page_numbers == []
def test_should_ignore_small_bitmap(self):
    """A page holding only a 1x1 image graphic yields no page numbers."""
    # The graphic keeps the page's position but is shrunk to a single unit
    # in each dimension.
    tiny_image = LayoutGraphic(
        graphic_type='image',
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(
            page_number=1,
            width=1,
            height=1
        )
    )
    page_meta = LayoutPageMeta(
        page_number=1,
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    )
    page = LayoutPage(blocks=[], graphics=[tiny_image], meta=page_meta)
    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(
        LayoutDocument(pages=[page])
    )
    assert page_numbers == []
def _preprocess_layout_graphics(
    self,
    layout_document: LayoutDocument,
    context: FullTextProcessorDocumentContext
) -> LayoutDocument:
    """Replace graphics in the layout document with semantic graphics.

    The document is returned unchanged when ``config.use_cv_model`` is
    falsy, when no candidate pages remain, or when the graphic provider
    yields no semantic graphics for the candidate pages.

    Candidate pages are those reported as having an uncommon page
    dimension, excluding pages reported as consisting mostly of bitmap
    graphics.

    Args:
        layout_document: The layout document to preprocess.
        context: Passed through to ``_get_document_graphic_provider``.

    Returns:
        Either the original ``layout_document`` or the result of one of
        the ``get_layout_document_with_..._replaced_by_graphics`` helpers.
    """
    if not self.config.use_cv_model:
        return layout_document

    uncommon_dimension_pages = set(
        get_page_numbers_with_uncommon_page_dimension(layout_document)
    )
    mostly_bitmap_pages = set(
        get_page_numbers_with_mostly_bitmap_graphics(layout_document)
    )
    candidate_page_numbers = sorted(
        uncommon_dimension_pages - mostly_bitmap_pages
    )
    LOGGER.debug('candidate_page_numbers: %r', candidate_page_numbers)
    if not candidate_page_numbers:
        return layout_document

    graphic_provider = self._get_document_graphic_provider(
        context=context,
        page_numbers=candidate_page_numbers
    )
    semantic_graphic_iterable = (
        graphic_provider.iter_semantic_graphic_for_layout_document(
            layout_document,
            extract_graphic_assets=self.config.extract_graphic_assets
        )
    )
    # Materialise the iterable so emptiness can be checked before replacing.
    semantic_graphics = list(semantic_graphic_iterable)
    if not semantic_graphics:
        LOGGER.info(
            'no semantic graphics found on pages %r',
            candidate_page_numbers
        )
        return layout_document

    if self.config.replace_text_by_cv_graphic:
        return get_layout_document_with_text_and_graphics_replaced_by_graphics(
            layout_document,
            semantic_graphics
        )
    return get_layout_document_with_graphics_replaced_by_graphics(
        layout_document,
        semantic_graphics
    )
def test_should_provide_empty_list_for_empty_document(self):
    """A document without any pages yields no page numbers."""
    empty_document = LayoutDocument(pages=[])
    assert get_page_numbers_with_mostly_bitmap_graphics(empty_document) == []