def test_should_provide_empty_list_for_empty_pages(self):
     """A document whose only page has no blocks yields no page numbers."""
     # Build the fixture bottom-up: coordinates -> page meta -> page -> document.
     page_coordinates = LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
     page_meta = LayoutPageMeta(page_number=1, coordinates=page_coordinates)
     empty_page = LayoutPage(blocks=[], meta=page_meta)
     document = LayoutDocument(pages=[empty_page])

     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(document)

     assert page_numbers == []
 def test_should_ignore_small_bitmap(self):
     """A 1x1 image graphic on a page without blocks yields no page numbers."""
     # The graphic reuses the page coordinates but is shrunk to width=1, height=1.
     tiny_image = LayoutGraphic(
         graphic_type='image',
         coordinates=LAYOUT_PAGE_COORDINATES_1._replace(
             page_number=1, width=1, height=1
         )
     )
     page_meta = LayoutPageMeta(
         page_number=1,
         coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
     )
     page = LayoutPage(blocks=[], graphics=[tiny_image], meta=page_meta)
     document = LayoutDocument(pages=[page])

     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(document)

     assert page_numbers == []
Example #3
0
 def _preprocess_layout_graphics(
     self,
     layout_document: LayoutDocument,
     context: FullTextProcessorDocumentContext
 ) -> LayoutDocument:
     """Replace layout content with semantic graphics on candidate pages.

     Candidate pages are those reported by
     ``get_page_numbers_with_uncommon_page_dimension`` minus those reported
     by ``get_page_numbers_with_mostly_bitmap_graphics``.

     The ``layout_document`` is returned unchanged when
     ``config.use_cv_model`` is falsy, when there are no candidate pages,
     or when the graphic provider yields no semantic graphics. Otherwise
     the result of one of the two replacement helpers is returned,
     selected by ``config.replace_text_by_cv_graphic``.
     """
     if not self.config.use_cv_model:
         return layout_document

     uncommon_dimension_pages = set(
         get_page_numbers_with_uncommon_page_dimension(layout_document)
     )
     mostly_bitmap_pages = set(
         get_page_numbers_with_mostly_bitmap_graphics(layout_document)
     )
     # Sorted list so logging and the provider see a stable page order.
     candidate_page_numbers = sorted(
         uncommon_dimension_pages - mostly_bitmap_pages
     )
     LOGGER.debug('candidate_page_numbers: %r', candidate_page_numbers)
     if not candidate_page_numbers:
         return layout_document

     graphic_provider = self._get_document_graphic_provider(
         context=context,
         page_numbers=candidate_page_numbers
     )
     semantic_graphic_iterable = (
         graphic_provider.iter_semantic_graphic_for_layout_document(
             layout_document,
             extract_graphic_assets=self.config.extract_graphic_assets
         )
     )
     # Materialize so emptiness can be checked before replacing anything.
     semantic_graphics = list(semantic_graphic_iterable)
     if not semantic_graphics:
         LOGGER.info('no semantic graphics found on pages %r', candidate_page_numbers)
         return layout_document

     if self.config.replace_text_by_cv_graphic:
         replace_by_graphics = (
             get_layout_document_with_text_and_graphics_replaced_by_graphics
         )
     else:
         replace_by_graphics = (
             get_layout_document_with_graphics_replaced_by_graphics
         )
     return replace_by_graphics(layout_document, semantic_graphics)
 def test_should_provide_empty_list_for_empty_document(self):
     """A document without any pages yields no page numbers."""
     empty_document = LayoutDocument(pages=[])

     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(empty_document)

     assert page_numbers == []