def _match_graphic_elements(
    self,
    semantic_graphic_list: Sequence[SemanticGraphic],
    candidate_semantic_content_list: Sequence[SemanticContentWrapper],
    unmatched_graphics_container: SemanticMixedContentWrapper
):
    """Attach graphics to matching candidate content; collect the rest.

    A chain of graphic matchers is run over the graphics and candidates.
    Each matched graphic is added to its candidate when that candidate is a
    `SemanticMixedContentWrapper`; every unmatched graphic is added to
    `unmatched_graphics_container`.

    Args:
        semantic_graphic_list: graphics to be matched.
        candidate_semantic_content_list: content the graphics may belong to.
        unmatched_graphics_container: receives graphics without a match.
    """
    matchers: List[GraphicMatcher] = [
        BoundingBoxDistanceGraphicMatcher(),
        GraphicRelatedBlockTextGraphicMatcher()
    ]
    if self.config.use_ocr_model:
        # The OCR based matcher is only chained in when enabled by the config;
        # the model is expected to be present in that case.
        ocr_model = self.fulltext_models.ocr_model
        assert ocr_model
        matchers.append(
            OpticalCharacterRecognitionGraphicMatcher(ocr_model=ocr_model)
        )
    match_result = ChainedGraphicMatcher(matchers).get_graphic_matches(
        semantic_graphic_list=semantic_graphic_list,
        candidate_semantic_content_list=candidate_semantic_content_list
    )
    for match in match_result.graphic_matches:
        target_content = match.candidate_semantic_content
        if not isinstance(target_content, SemanticMixedContentWrapper):
            # Only mixed content wrappers can take additional content.
            continue
        target_content.add_content(match.semantic_graphic)
    unmatched_graphics = match_result.unmatched_graphics
    LOGGER.info('unmatched_graphics: %r', unmatched_graphics)
    for graphic in unmatched_graphics:
        unmatched_graphics_container.add_content(graphic)
def iter_model_layout_documents(
    self,
    layout_document: LayoutDocument,
    document_context: TrainingDataDocumentContext
) -> Iterable[LayoutDocument]:
    """Return a layout document made of the raw author blocks in references.

    The `<references>` segment is labelled with the reference segmenter model,
    each raw reference text is then labelled with the citation model, and the
    blocks of all `SemanticRawAuthors` found are combined into one document.

    Args:
        layout_document: the document to extract from.
        document_context: provides the models and labelling context.

    Returns:
        A list with a single `LayoutDocument`, or an empty list when no raw
        reference text or no raw authors were found.
    """
    fulltext_models = document_context.fulltext_models
    reference_segmenter_model = fulltext_models.reference_segmenter_model
    citation_model = fulltext_models.citation_model

    segmentation_label_result = get_segmentation_label_result(
        layout_document,
        document_context=document_context
    )
    references_layout_document = (
        segmentation_label_result
        .get_filtered_document_by_label('<references>')
        .remove_empty_blocks()
    )
    reference_segmenter_labeled_layout_tokens = (
        get_labeled_layout_tokens_for_model_and_layout_document(
            model=reference_segmenter_model,
            layout_document=references_layout_document,
            document_context=document_context
        )
    )

    # Collect the raw reference text of every raw reference.
    reference_content = SemanticMixedContentWrapper(list(
        reference_segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
            reference_segmenter_labeled_layout_tokens
        )
    ))
    raw_reference_text_list = []
    for raw_reference in reference_content.iter_by_type(SemanticRawReference):
        raw_reference_text_list.extend(
            raw_reference.iter_by_type(SemanticRawReferenceText)
        )
    LOGGER.info('raw_reference_text_list count: %d', len(raw_reference_text_list))
    if not raw_reference_text_list:
        return []

    # One layout document per raw reference text for the citation model.
    citation_layout_documents = []
    for raw_reference_text in raw_reference_text_list:
        citation_layout_documents.append(
            LayoutDocument.for_blocks(list(raw_reference_text.iter_blocks()))
        )
    citation_labeled_layout_tokens_list = (
        get_labeled_layout_tokens_list_for_model_and_layout_documents(
            model=citation_model,
            layout_documents=citation_layout_documents,
            document_context=document_context
        )
    )

    semantic_raw_author_list = []
    for citation_labeled_layout_tokens in citation_labeled_layout_tokens_list:
        citation_content = SemanticMixedContentWrapper(list(
            citation_model.iter_semantic_content_for_labeled_layout_tokens(
                citation_labeled_layout_tokens
            )
        ))
        semantic_raw_author_list.extend(
            citation_content.iter_by_type_recursively(SemanticRawAuthors)
        )
    LOGGER.info('semantic_raw_author_list count: %d', len(semantic_raw_author_list))
    if not semantic_raw_author_list:
        return []

    author_blocks = []
    for semantic_raw_author in semantic_raw_author_list:
        author_blocks.extend(semantic_raw_author.iter_blocks())
    return [LayoutDocument.for_blocks(author_blocks)]
def append_semantic_markers_for_layout_block(
    parent_semantic_content: SemanticMixedContentWrapper,
    layout_block: LayoutBlock
) -> None:
    """Add the semantic markers found in a layout block to the parent content.

    Args:
        parent_semantic_content: the content wrapper receiving the markers.
        layout_block: the block passed to
            `iter_semantic_markers_for_layout_block`.
    """
    # All markers are collected before the first one is added to the parent.
    markers = [*iter_semantic_markers_for_layout_block(layout_block)]
    for marker in markers:
        parent_semantic_content.add_content(marker)
def test_should_split_marker_on_non_numeric_characters(self):
    """The text '+*!' yields the markers '+', '*' and '!'."""
    layout_block = LayoutBlock.for_text('+*!')
    wrapper = SemanticMixedContentWrapper(
        list(iter_semantic_markers_for_layout_block(layout_block))
    )
    marker_texts = wrapper.view_by_type(SemanticMarker).get_text_list()
    assert marker_texts == ['+', '*', '!']
    # The merged block text matches the original input text.
    assert wrapper.merged_block.text == '+*!'
def test_should_not_split_markers_on_digit(self):
    """The text '11,12' yields the markers '11' and '12'."""
    layout_block = LayoutBlock.for_text('11,12')
    wrapper = SemanticMixedContentWrapper(
        list(iter_semantic_markers_for_layout_block(layout_block))
    )
    marker_texts = wrapper.view_by_type(SemanticMarker).get_text_list()
    assert marker_texts == ['11', '12']
    # The merged block text matches the original input text.
    assert wrapper.merged_block.text == '11,12'
def test_should_split_markers_on_space(self):
    """The text '1 2' yields the markers '1' and '2'."""
    layout_block = LayoutBlock.for_text('1 2')
    markers = list(iter_semantic_markers_for_layout_block(layout_block))
    LOGGER.debug('semantic_markers: %r', markers)
    wrapper = SemanticMixedContentWrapper(markers)
    marker_texts = wrapper.view_by_type(SemanticMarker).get_text_list()
    assert marker_texts == ['1', '2']
    # The merged block text matches the original input text.
    assert wrapper.merged_block.text == '1 2'
def iter_filter_layout_document(
    self,
    layout_document: LayoutDocument
) -> Iterable[LayoutDocument]:
    """Return layout documents containing the raw authors of the header.

    The `<header>` segment is labelled with the header model and the
    `SemanticRawAuthors` items are extracted from the result.

    Args:
        layout_document: the document to filter.

    Returns:
        A single merged document when `self.merge_raw_authors` is truthy,
        otherwise one document per raw authors item. Empty blocks are removed
        from every returned document.
    """
    header_layout_document = self.filter_layout_document_by_segmentation_label(
        layout_document,
        '<header>'
    )
    labeled_layout_tokens = self.header_model.predict_labels_for_layout_document(
        header_layout_document,
        app_features_context=self.app_features_context
    )
    LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)

    header_content = SemanticMixedContentWrapper(list(
        self.header_model.iter_semantic_content_for_labeled_layout_tokens(
            labeled_layout_tokens
        )
    ))
    raw_authors_items = list(header_content.iter_by_type(SemanticRawAuthors))
    LOGGER.info('semantic_raw_authors_list count: %d', len(raw_authors_items))
    LOGGER.info('merge_raw_authors: %s', self.merge_raw_authors)

    if not self.merge_raw_authors:
        # One document per raw authors item.
        separate_documents = []
        for raw_authors_item in raw_authors_items:
            separate_documents.append(
                LayoutDocument.for_blocks(
                    list(raw_authors_item.iter_blocks())
                ).remove_empty_blocks()
            )
        return separate_documents

    # All blocks of all raw authors items go into a single document.
    merged_blocks = []
    for raw_authors_item in raw_authors_items:
        merged_blocks.extend(raw_authors_item.iter_blocks())
    return [LayoutDocument.for_blocks(merged_blocks).remove_empty_blocks()]
def iter_model_layout_documents(
    self,
    layout_document: LayoutDocument,
    document_context: TrainingDataDocumentContext
) -> Iterable[LayoutDocument]:
    """Return one layout document per raw table found in the body.

    The `<body>` segment is labelled with the fulltext model and all
    `SemanticRawTable` items are collected recursively from the result.

    Args:
        layout_document: the document to extract from.
        document_context: provides the models and labelling context.

    Returns:
        A list of `LayoutDocument`, one per raw table; an empty list when the
        body has no pages or no raw table was found.
    """
    fulltext_model = document_context.fulltext_models.fulltext_model
    segmentation_label_result = get_segmentation_label_result(
        layout_document,
        document_context=document_context
    )
    body_layout_document = (
        segmentation_label_result
        .get_filtered_document_by_label('<body>')
        .remove_empty_blocks()
    )
    if not body_layout_document.pages:
        return []

    fulltext_labeled_layout_tokens = (
        get_labeled_layout_tokens_for_model_and_layout_document(
            model=fulltext_model,
            layout_document=body_layout_document,
            document_context=document_context
        )
    )
    body_content = SemanticMixedContentWrapper(list(
        fulltext_model.iter_semantic_content_for_labeled_layout_tokens(
            fulltext_labeled_layout_tokens
        )
    ))
    raw_table_list = list(body_content.iter_by_type_recursively(SemanticRawTable))
    LOGGER.info('raw_table_list count: %d', len(raw_table_list))
    if not raw_table_list:
        return []

    table_documents = []
    for raw_table in raw_table_list:
        table_documents.append(
            LayoutDocument.for_blocks(list(raw_table.iter_blocks()))
        )
    return table_documents
def iter_model_layout_documents(
    self,
    layout_document: LayoutDocument,
    document_context: TrainingDataDocumentContext
) -> Iterable[LayoutDocument]:
    """Return a layout document made of the raw author blocks in the header.

    The `<header>` segment is labelled with the header model and the blocks
    of all `SemanticRawAuthors` items are combined into one document.

    Args:
        layout_document: the document to extract from.
        document_context: provides the models and labelling context.

    Returns:
        A list with a single `LayoutDocument`, or an empty list when the
        header has no pages or no raw authors were found.
    """
    header_model = document_context.fulltext_models.header_model
    segmentation_label_result = get_segmentation_label_result(
        layout_document,
        document_context=document_context
    )
    header_layout_document = (
        segmentation_label_result
        .get_filtered_document_by_label('<header>')
        .remove_empty_blocks()
    )
    LOGGER.debug('header_layout_document: %r', header_layout_document)
    if not header_layout_document.pages:
        return []

    header_labeled_layout_tokens = (
        get_labeled_layout_tokens_for_model_and_layout_document(
            model=header_model,
            layout_document=header_layout_document,
            document_context=document_context
        )
    )
    header_content = SemanticMixedContentWrapper(list(
        header_model.iter_semantic_content_for_labeled_layout_tokens(
            header_labeled_layout_tokens
        )
    ))
    semantic_raw_author_list = list(
        header_content.iter_by_type(SemanticRawAuthors)
    )
    LOGGER.info('semantic_raw_author_list count: %d', len(semantic_raw_author_list))
    if not semantic_raw_author_list:
        return []

    author_blocks = []
    for semantic_raw_author in semantic_raw_author_list:
        author_blocks.extend(semantic_raw_author.iter_blocks())
    return [LayoutDocument.for_blocks(author_blocks)]
def iter_filter_layout_document(
    self,
    layout_document: LayoutDocument
) -> Iterable[LayoutDocument]:
    """Return one layout document per raw authors item found in references.

    The `<references>` segment is labelled with the reference segmenter
    model. The merged raw reference text of each raw reference is labelled
    with the citation model, and the `SemanticRawAuthors` items of every
    resulting `SemanticReference` are collected.

    Args:
        layout_document: the document to filter.

    Returns:
        A list of `LayoutDocument`, each built from the merged block of one
        raw authors item with empty blocks removed.
    """
    references_layout_document = self.filter_layout_document_by_segmentation_label(
        layout_document,
        '<references>'
    )
    labeled_layout_tokens = (
        self.reference_segmenter_model.predict_labels_for_layout_document(
            references_layout_document,
            app_features_context=self.app_features_context
        )
    )
    LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)

    references_content = SemanticMixedContentWrapper(list(
        self.reference_segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
            labeled_layout_tokens
        )
    ))
    semantic_raw_references = list(
        references_content.iter_by_type(SemanticRawReference)
    )
    LOGGER.info('semantic_raw_references count: %d', len(semantic_raw_references))

    # One document per raw reference, built from its merged reference text.
    raw_reference_documents = []
    for semantic_raw_reference in semantic_raw_references:
        reference_text_block = semantic_raw_reference.view_by_type(
            SemanticRawReferenceText
        ).merged_block
        raw_reference_documents.append(
            LayoutDocument.for_blocks([reference_text_block]).remove_empty_blocks()
        )

    citation_labeled_layout_tokens_list = (
        self.citation_model.predict_labels_for_layout_documents(
            raw_reference_documents,
            app_features_context=self.app_features_context
        )
    )

    raw_authors = []
    for citation_labeled_layout_tokens in citation_labeled_layout_tokens_list:
        citation_content_iterable = (
            self.citation_model.iter_semantic_content_for_labeled_layout_tokens(
                citation_labeled_layout_tokens
            )
        )
        for ref in citation_content_iterable:
            if not isinstance(ref, SemanticReference):
                # Only parsed references are searched for raw authors.
                continue
            raw_authors.extend(ref.iter_by_type(SemanticRawAuthors))

    author_documents = []
    for raw_author in raw_authors:
        author_documents.append(
            LayoutDocument.for_blocks([raw_author.merged_block]).remove_empty_blocks()
        )
    return author_documents
def _process_raw_authors(self, semantic_parent: SemanticMixedContentWrapper):
    """Replace the raw authors of the parent with name header model output.

    `SemanticRawAuthors` items are removed from the parent content. If there
    were any, they are labelled with the name header model (as one merged
    document when `self.config.merge_raw_authors` is truthy, otherwise one
    document each) and the resulting semantic content is appended after the
    remaining content. The parent's `mixed_content` is then reassigned.

    Args:
        semantic_parent: the content wrapper that is updated in place.
    """
    remaining_content: List[SemanticContentWrapper] = []
    raw_authors: List[SemanticRawAuthors] = []
    for content_item in semantic_parent:
        if isinstance(content_item, SemanticRawAuthors):
            raw_authors.append(content_item)
        else:
            remaining_content.append(content_item)

    if raw_authors:
        if not self.config.merge_raw_authors:
            # One document per raw authors item.
            raw_authors_layout_documents = []
            for raw_author in raw_authors:
                raw_authors_layout_documents.append(
                    LayoutDocument.for_blocks(list(raw_author.iter_blocks()))
                )
        else:
            # A single document with the blocks of all raw authors items.
            merged_blocks = []
            for raw_author in raw_authors:
                merged_blocks.extend(raw_author.iter_blocks())
            raw_authors_layout_documents = [
                LayoutDocument.for_blocks(merged_blocks)
            ]

        name_header_model = self.name_header_model
        labeled_layout_tokens_list = (
            name_header_model.predict_labels_for_layout_documents(
                raw_authors_layout_documents,
                app_features_context=self.app_features_context
            )
        )
        LOGGER.debug('labeled_layout_tokens_list (author): %r', labeled_layout_tokens_list)
        for labeled_layout_tokens in labeled_layout_tokens_list:
            remaining_content.extend(
                name_header_model.iter_semantic_content_for_labeled_layout_tokens(
                    labeled_layout_tokens
                )
            )

    semantic_parent.mixed_content = remaining_content
def test_should_return_empty_list_with_empty_list_of_graphics(self):
    """Matching an empty list of graphics gives a falsy result."""
    matcher = BoundingBoxDistanceGraphicMatcher()
    candidates = [SemanticMixedContentWrapper()]
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[],
        candidate_semantic_content_list=candidates
    )
    assert not result