def test_should_generate_tei_from_model_data(self):
    """The affiliation-address training TEI should contain a single
    affiliation with one <lb/> per layout line and the line texts."""
    doc = LayoutDocument.for_blocks([
        LayoutBlock(lines=[
            get_next_layout_line_for_text(TEXT_1),
            get_next_layout_line_for_text(TEXT_2)
        ])
    ])
    model_data = get_data_generator().iter_model_data_for_layout_document(doc)
    xml_root = (
        AffiliationAddressTeiTrainingDataGenerator()
        .get_training_tei_xml_for_model_data_iterable(model_data)
    )
    LOGGER.debug('xml: %r', etree.tostring(xml_root))
    aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
    assert len(aff_nodes) == 1
    lb_nodes = tei_xpath(aff_nodes[0], 'tei:lb')
    assert len(lb_nodes) == 2
    # first line text precedes the first line break, second line follows it
    assert lb_nodes[0].getparent().text == TEXT_1
    assert lb_nodes[0].tail == '\n' + TEXT_2
def test_should_generate_tei_from_model_data(self):
    """The full-text training TEI should have one <text> element containing
    one <lb/> per layout line, with the line texts around the breaks."""
    doc = LayoutDocument.for_blocks([
        LayoutBlock(lines=[
            get_next_layout_line_for_text(TEXT_1),
            get_next_layout_line_for_text(TEXT_2)
        ])
    ])
    model_data = get_data_generator().iter_model_data_for_layout_document(doc)
    xml_root = (
        FullTextTeiTrainingDataGenerator()
        .get_training_tei_xml_for_model_data_iterable(model_data)
    )
    LOGGER.debug('xml: %r', etree.tostring(xml_root))
    text_nodes = xml_root.xpath('./text')
    assert len(text_nodes) == 1
    lb_nodes = text_nodes[0].xpath('lb')
    assert len(lb_nodes) == 2
    # first line text precedes the first line break, second line follows it
    assert lb_nodes[0].getparent().text == TEXT_1
    assert lb_nodes[0].tail == '\n' + TEXT_2
def test_should_provide_repetitive_pattern_feature(
        self, features_provider: SegmentationLineFeaturesProvider):
    """A line repeating across pages should be flagged as a repetitive
    pattern, with only its first occurrence flagged as 'first'."""
    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[
            LayoutBlock.for_text('this is repetitive'),
            LayoutBlock.for_text('this is not')
        ]),
        LayoutPage(blocks=[
            LayoutBlock.for_text('this is repetitive'),
            LayoutBlock.for_text('it is different')
        ])
    ])
    feature_values = [
        {
            'get_str_is_repetitive_pattern': (
                features.get_str_is_repetitive_pattern()
            ),
            'get_str_is_first_repetitive_pattern': (
                features.get_str_is_first_repetitive_pattern()
            )
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    assert feature_values == [
        {
            'get_str_is_repetitive_pattern': '1',
            'get_str_is_first_repetitive_pattern': '1'
        },
        {
            'get_str_is_repetitive_pattern': '0',
            'get_str_is_first_repetitive_pattern': '0'
        },
        {
            'get_str_is_repetitive_pattern': '1',
            'get_str_is_first_repetitive_pattern': '0'
        },
        {
            'get_str_is_repetitive_pattern': '0',
            'get_str_is_first_repetitive_pattern': '0'
        },
    ]
def test_should_extract_acknowledgement_only(
        self, fulltext_models_mock: MockFullTextModels):
    """An acknowledgement-only document should end up as an
    acknowledgement section in the back matter."""
    processor = FullTextProcessor(fulltext_models_mock)
    acknowledgment_block = LayoutBlock.for_text('Some acknowledgement')
    fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
        acknowledgment_block, '<acknowledgement>')
    fulltext_models_mock.fulltext_model_mock.update_label_by_layout_block(
        acknowledgment_block, '<paragraph>')
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[acknowledgment_block])])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=layout_document)
    assert semantic_document is not None
    ack_section = semantic_document.back_section.view_by_section_type(
        SemanticSectionTypes.ACKNOWLEDGEMENT)
    assert ack_section.get_text() == acknowledgment_block.text
def test_should_filter_by_token_multiple_labels(self):
    """Filtering a labelled document by a subset of labels should keep
    exactly the tokens tagged with those labels, in order."""
    tagged_tokens = [
        (TAG_1, get_layout_tokens_for_text('tokens tag 1')),
        (TAG_2, get_layout_tokens_for_text('tokens tag 2')),
        (TAG_3, get_layout_tokens_for_text('tokens tag 3'))
    ]
    all_tokens = [token for _, tokens in tagged_tokens for token in tokens]
    line = LayoutLine(all_tokens)
    layout_model_labels = [
        LayoutModelLabel(
            label=tag,
            label_token_text=token.text,
            layout_line=line,
            layout_token=token
        )
        for tag, tokens in tagged_tokens
        for token in tokens
    ]
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])])
    label_result = LayoutDocumentLabelResult(
        layout_document, layout_model_labels)
    filtered_document = label_result.get_filtered_document_by_labels(
        [TAG_1, TAG_3])
    expected_tokens = tagged_tokens[0][1] + tagged_tokens[2][1]
    assert (
        join_layout_tokens(filtered_document.iter_all_tokens())
        == join_layout_tokens(expected_tokens)
    )
def test_should_extract_author_names_separated_by_another_tag(
        self, fulltext_models_mock: MockFullTextModels):
    """Forename and surname blocks separated by an unlabelled block should
    still be merged into one author when merge_raw_authors is enabled."""
    given_name_block = LayoutBlock.for_text('Given name')
    surname_block = LayoutBlock.for_text('Surname')
    other_block = LayoutBlock.for_text('Other')
    header_block = LayoutBlock.merge_blocks(
        [given_name_block, other_block, surname_block])
    processor = FullTextProcessor(
        fulltext_models_mock,
        config=FullTextProcessorConfig(merge_raw_authors=True))
    fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
        header_block, '<header>')
    # only the name blocks are labelled as author; the block in between is not
    header_model_mock = fulltext_models_mock.header_model_mock
    header_model_mock.update_label_by_layout_block(
        given_name_block, '<author>')
    header_model_mock.update_label_by_layout_block(
        surname_block, '<author>')
    name_header_model_mock = fulltext_models_mock.name_header_model_mock
    name_header_model_mock.update_label_by_layout_block(
        given_name_block, '<forename>')
    name_header_model_mock.update_label_by_layout_block(
        surname_block, '<surname>')
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[header_block])])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=layout_document)
    assert semantic_document is not None
    authors = semantic_document.front.authors
    assert len(authors) == 1
    assert authors[0].given_name_text == given_name_block.text
    assert authors[0].surname_text == surname_block.text
def iter_filter_layout_document(
        self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
    """Return one layout document per raw affiliation-address found in the
    header segment of the given layout document (empty blocks removed)."""
    header_layout_document = self.filter_layout_document_by_segmentation_label(
        layout_document, '<header>')
    labeled_layout_tokens = self.header_model.predict_labels_for_layout_document(
        header_layout_document,
        app_features_context=self.app_features_context)
    LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
    semantic_content = SemanticMixedContentWrapper(list(
        self.header_model.iter_semantic_content_for_labeled_layout_tokens(
            labeled_layout_tokens)))
    semantic_raw_aff_address_list = list(
        semantic_content.iter_by_type(SemanticRawAffiliationAddress))
    LOGGER.info('semantic_raw_aff_address_list count: %d',
                len(semantic_raw_aff_address_list))
    return [
        LayoutDocument.for_blocks(
            list(raw_aff_address.iter_blocks())
        ).remove_empty_blocks()
        for raw_aff_address in semantic_raw_aff_address_list
    ]
def test_should_extract_raw_references_from_document(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """With citation field extraction disabled, references should be kept as
    raw label + reference text, with content ids assigned ('b0', ...)."""
    label_block = LayoutBlock.for_text('1')
    ref_text_block = LayoutBlock.for_text('Reference 1')
    ref_block = LayoutBlock.merge_blocks([label_block, ref_text_block])
    processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(extract_citation_fields=False))
    fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
        ref_block, '<references>')
    reference_segmenter_model_mock = (
        fulltext_models_mock.reference_segmenter_model_mock)
    reference_segmenter_model_mock.update_label_by_layout_block(
        label_block, '<label>')
    reference_segmenter_model_mock.update_label_by_layout_block(
        ref_text_block, '<reference>')
    layout_document = LayoutDocument(pages=[LayoutPage(blocks=[ref_block])])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=layout_document)
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None
    assert semantic_document.back_section.get_text() == ref_block.text
    reference_lists = list(
        semantic_document.back_section.iter_by_type(SemanticReferenceList))
    assert len(reference_lists) == 1
    raw_references = list(
        reference_lists[0].iter_by_type(SemanticRawReference))
    assert len(raw_references) == 1
    raw_reference = raw_references[0]
    assert raw_reference.get_text_by_type(SemanticLabel) == label_block.text
    assert raw_reference.get_text_by_type(
        SemanticRawReferenceText) == ref_text_block.text
    assert raw_reference.content_id == 'b0'
def test_should_not_merge_separate_raw_affiliations(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels):
    """Separately labelled affiliation blocks should remain distinct
    affiliations with their own content ids ('aff0', 'aff1')."""
    aff_suffix_texts = ['1', '2']
    institution_blocks = [
        LayoutBlock.for_text(f'Institution{t}') for t in aff_suffix_texts
    ]
    header_block = LayoutBlock.merge_blocks(institution_blocks)
    processor = FullTextProcessor(fulltext_models_mock)
    fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
        header_block, '<header>')
    for institution_block in institution_blocks:
        fulltext_models_mock.header_model_mock.update_label_by_layout_block(
            institution_block, '<affiliation>')
    for institution_block in institution_blocks:
        fulltext_models_mock.affiliation_address_model_mock \
            .update_label_by_layout_block(institution_block, '<institution>')
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[header_block])])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=layout_document)
    assert semantic_document is not None
    affiliations = list(
        semantic_document.front.iter_by_type(SemanticAffiliationAddress))
    LOGGER.debug('affiliations: %r', affiliations)
    assert [aff.get_text() for aff in affiliations] == [
        block.text for block in institution_blocks
    ]
    assert [aff.content_id for aff in affiliations] == ['aff0', 'aff1']
def get_layout_document_with_text_or_graphic_replaced_by_graphics(
    layout_document: LayoutDocument,
    semantic_graphics: Iterable[SemanticGraphic],
    is_replace_overlapping_text: bool
) -> LayoutDocument:
    """Return a layout document in which, for each semantic graphic with
    page coordinates, the corresponding page is replaced via
    ``get_layout_page_with_text_or_graphic_replaced_by_graphic``.

    Returns the original document unchanged if no semantic graphic had
    usable coordinates.

    :param layout_document: the document whose pages may be replaced
    :param semantic_graphics: graphics to apply; ones without a layout
        graphic or without coordinates are skipped (counting) / asserted
    :param is_replace_overlapping_text: passed through to the per-page
        replacement function
    """
    # index pages by page number; pages without meta cannot be looked up
    page_by_page_number = {
        page.meta.page_number: page
        for page in layout_document.pages
        if page.meta
    }
    LOGGER.debug('page_by_page_number.keys: %r', page_by_page_number.keys())
    has_changes = False
    semantic_graphics_list = list(semantic_graphics)
    # count graphics per page so the per-page replacement knows whether the
    # graphic is the only one on its page (count < 2)
    semantic_graphic_count_by_page = Counter((
        semantic_graphic.layout_graphic.coordinates.page_number
        for semantic_graphic in semantic_graphics_list
        if (
            semantic_graphic.layout_graphic
            and semantic_graphic.layout_graphic.coordinates
        )
    ))
    for semantic_graphic in semantic_graphics_list:
        layout_graphic = semantic_graphic.layout_graphic
        assert layout_graphic
        if not layout_graphic.coordinates:
            # cannot place a graphic without coordinates on a page
            continue
        page_number = layout_graphic.coordinates.page_number
        # replace the page in the index; later graphics on the same page
        # operate on the already-replaced page
        page_by_page_number[page_number] = (
            get_layout_page_with_text_or_graphic_replaced_by_graphic(
                page_by_page_number[page_number],
                semantic_graphic,
                is_only_semantic_graphic_on_page=(
                    semantic_graphic_count_by_page[page_number] < 2
                ),
                is_replace_overlapping_text=is_replace_overlapping_text
            )
        )
        has_changes = True
    if not has_changes:
        return layout_document
    # rebuild the page list in original order, substituting replaced pages
    pages = [
        (
            page_by_page_number[page.meta.page_number]
            if page.meta
            else page
        )
        for page in layout_document.pages
    ]
    return layout_document.replace(pages=pages)
def _process_raw_affiliations(self, semantic_document: SemanticDocument):
    """Replace raw affiliation-address content in the document front with
    parsed affiliations from the affiliation-address model, then assign
    content ids ('aff0', 'aff1', ...) to the resulting affiliations.

    :param semantic_document: document whose ``front.mixed_content`` is
        rewritten in place
    """
    result_content: List[SemanticContentWrapper] = []
    raw_aff_address_list: List[SemanticRawAffiliationAddress] = []
    # split the front content: collect raw affiliations for parsing and
    # keep all other content in its original order
    for semantic_content in semantic_document.front:
        if isinstance(semantic_content, SemanticRawAffiliationAddress):
            raw_aff_address_list.append(semantic_content)
            continue
        result_content.append(semantic_content)
    if raw_aff_address_list:
        # one layout document per raw affiliation, labelled in a single batch
        raw_aff_layout_documents = [
            LayoutDocument.for_blocks(list(raw_aff_or_address.iter_blocks()))
            for raw_aff_or_address in raw_aff_address_list
        ]
        labeled_layout_tokens_list = (
            self.affiliation_address_model
            .predict_labels_for_layout_documents(
                raw_aff_layout_documents,
                app_features_context=self.app_features_context
            )
        )
        LOGGER.debug(
            'labeled_layout_tokens_list (aff): %r',
            labeled_layout_tokens_list)
        # flatten the per-document semantic content into one iterable,
        # appended after the non-affiliation front content
        aff_iterable = (
            aff
            for labeled_layout_tokens in labeled_layout_tokens_list
            for aff in (
                self.affiliation_address_model
                .iter_semantic_content_for_labeled_layout_tokens(labeled_layout_tokens)
            )
        )
        for aff in aff_iterable:
            result_content.append(aff)
    semantic_document.front.mixed_content = result_content
    self._assign_content_ids(
        semantic_document.front.iter_by_type(SemanticAffiliationAddress),
        iter(iter_ids('aff'))
    )
def test_should_provide_punctuation_profile(
        self, features_provider: SegmentationLineFeaturesProvider):
    """The punctuation profile should list the line's punctuation characters
    and its length feature should give their count as a string."""
    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[LayoutBlock(lines=[
            LayoutLine.for_text('a .: b'),
        ])])
    ])
    feature_values = [
        {
            'line_punctuation_profile': features.get_line_punctuation_profile(),
            'line_punctuation_profile_length_feature': (
                features.get_line_punctuation_profile_length_feature()
            ),
        }
        for features in _iter_line_features(features_provider, layout_document)
    ]
    LOGGER.debug('feature_values: %r', feature_values)
    assert feature_values == [
        {
            'line_punctuation_profile': '.:',
            'line_punctuation_profile_length_feature': '2'
        },
    ]
def test_should_extract_author_names_from_document(
        self, fulltext_models_mock: MockFullTextModels):
    """A header author block should be split into forename and surname
    via the name-header model."""
    given_name_block = LayoutBlock.for_text('Given name')
    surname_block = LayoutBlock.for_text('Surname')
    authors_block = LayoutBlock.merge_blocks(
        [given_name_block, surname_block])
    processor = FullTextProcessor(fulltext_models_mock)
    fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
        authors_block, '<header>')
    fulltext_models_mock.header_model_mock.update_label_by_layout_block(
        authors_block, '<author>')
    name_header_model_mock = fulltext_models_mock.name_header_model_mock
    name_header_model_mock.update_label_by_layout_block(
        given_name_block, '<forename>')
    name_header_model_mock.update_label_by_layout_block(
        surname_block, '<surname>')
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[authors_block])])
    semantic_document = processor.get_semantic_document_for_layout_document(
        layout_document=layout_document)
    assert semantic_document is not None
    assert semantic_document.front.get_text() == authors_block.text
    assert semantic_document.front.view_by_type(
        SemanticAuthor).get_text() == authors_block.text
    authors = semantic_document.front.authors
    assert len(authors) == 1
    assert authors[0].given_name_text == given_name_block.text
    assert authors[0].surname_text == surname_block.text
def iter_filter_layout_document(
        self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
    """Return one layout document per raw reference found in the
    references segment of the given layout document."""
    references_layout_document = self.filter_layout_document_by_segmentation_label(
        layout_document, '<references>')
    labeled_layout_tokens = self.reference_segmenter_model.predict_labels_for_layout_document(
        references_layout_document,
        app_features_context=self.app_features_context)
    LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
    semantic_content = SemanticMixedContentWrapper(list(
        self.reference_segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
            labeled_layout_tokens)))
    semantic_raw_references = list(
        semantic_content.iter_by_type(SemanticRawReference))
    LOGGER.info('semantic_raw_references count: %d',
                len(semantic_raw_references))
    # each result document contains only the raw reference text block
    return [
        LayoutDocument.for_blocks([
            raw_reference.view_by_type(SemanticRawReferenceText).merged_block
        ]).remove_empty_blocks()
        for raw_reference in semantic_raw_references
    ]
def test_should_provide_empty_list_for_empty_document(self):
    """A document without any pages should yield no page numbers."""
    empty_document = LayoutDocument(pages=[])
    assert get_page_numbers_with_mostly_bitmap_graphics(empty_document) == []
def test_should_provide_empty_list_for_empty_document(self):
    """A document without any pages should yield no page numbers."""
    empty_document = LayoutDocument(pages=[])
    assert get_page_numbers_with_uncommon_page_dimension(empty_document) == []
def _get_layout_document_for_layout_graphic(
        layout_graphic: LayoutGraphic) -> LayoutDocument:
    """Wrap a single graphic in a one-page layout document with no text blocks."""
    page = LayoutPage(blocks=[], graphics=[layout_graphic])
    return LayoutDocument(pages=[page])
def test_should_not_change_layout_document_if_semantic_graphics_is_empty(
        self):
    """Passing no semantic graphics should return the document unchanged."""
    layout_document = LayoutDocument(pages=[])
    assert get_layout_document_with_graphics_replaced_by_graphics(
        layout_document, semantic_graphics=[]
    ) == layout_document
def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
        self, fulltext_models_mock: MockFullTextModels,
        segmentation_label: str):
    """Figure label, caption, citation and the matching graphic should be
    extracted from a document segmented with ``segmentation_label``
    (parametrized), with content ids linking citation to figure."""
    citation_block = LayoutBlock.for_text('Figure 1')
    # coordinates are moved down by 10 for each subsequent block so that
    # the graphic, label and caption are vertically adjacent
    _coordinates = LayoutPageCoordinates(x=10, y=10, width=100, height=10)
    graphic_local_file_path = '/path/to/graphic1.svg'
    graphic = LayoutGraphic(
        coordinates=_coordinates,
        local_file_path=graphic_local_file_path)
    _coordinates = _coordinates.move_by(dy=10)
    label_block = LayoutBlock.for_text('Figure 1', coordinates=_coordinates)
    _coordinates = _coordinates.move_by(dy=10)
    caption_block = LayoutBlock.for_text(
        'Caption 1', coordinates=_coordinates)
    other_block = LayoutBlock.for_text('Other')
    figure_block = LayoutBlock.merge_blocks(
        [label_block, other_block, caption_block])
    fulltext_block = LayoutBlock.merge_blocks(
        [citation_block, figure_block])
    fulltext_processor = FullTextProcessor(
        fulltext_models_mock,
        FullTextProcessorConfig(
            extract_figure_fields=True,
            extract_graphic_bounding_boxes=True,
            extract_graphic_assets=True))
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    figure_model_mock = fulltext_models_mock.figure_model_mock
    segmentation_model_mock.update_label_by_layout_block(
        fulltext_block, segmentation_label)
    fulltext_model_mock.update_label_by_layout_block(
        citation_block, '<figure_marker>')
    fulltext_model_mock.update_label_by_layout_block(
        figure_block, '<figure>')
    figure_model_mock.update_label_by_layout_block(label_block, '<label>')
    figure_model_mock.update_label_by_layout_block(
        caption_block, '<figDesc>')
    layout_document = LayoutDocument(
        pages=[LayoutPage(blocks=[fulltext_block], graphics=[graphic])])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=layout_document)
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None
    # the figure may end up in the body or the back section depending on
    # the parametrized segmentation label
    figure_list = list(
        iter_by_semantic_type_recursively([
            semantic_document.body_section,
            semantic_document.back_section
        ], SemanticFigure))
    assert len(figure_list) == 1
    figure = figure_list[0]
    assert figure.get_text_by_type(SemanticLabel) == label_block.text
    assert figure.get_text_by_type(SemanticCaption) == caption_block.text
    assert figure.content_id == 'fig_0'
    figure_citation_list = list(
        semantic_document.iter_by_type_recursively(SemanticFigureCitation))
    assert len(figure_citation_list) == 1
    assert figure_citation_list[0].get_text() == citation_block.text
    assert figure_citation_list[0].target_content_id == 'fig_0'
    # the graphic should be attached to the figure with a relative asset path
    semantic_graphic_list = list(figure.iter_by_type(SemanticGraphic))
    assert semantic_graphic_list
    assert semantic_graphic_list[0].layout_graphic == graphic
    assert semantic_graphic_list[0].relative_path == os.path.basename(
        graphic_local_file_path)
def normalize_layout_document(
        layout_document: LayoutDocument, **kwargs) -> LayoutDocument:
    """Retokenize the document using ``normalize_and_tokenize_text`` and
    remove blocks that became empty (kwargs forwarded to the removal)."""
    retokenized_document = layout_document.retokenize(
        tokenize_fn=normalize_and_tokenize_text)
    return retokenized_document.remove_empty_blocks(**kwargs)
def iter_semantic_graphic_for_layout_document(
        self, layout_document: LayoutDocument,
        extract_graphic_assets: bool) -> Iterable[SemanticGraphic]:
    """Return semantic graphics for all graphics of the layout document."""
    all_graphics = layout_document.iter_all_graphics()
    return get_semantic_graphic_list_for_layout_graphic_list(
        all_graphics,
        extract_graphic_assets=extract_graphic_assets)
def iter_line_features(  # pylint: disable=too-many-locals
    self, layout_document: LayoutDocument
) -> Iterable[SegmentationLineFeatures]:
    """Yield per-line segmentation features for every line of the document.

    NOTE: a single mutable ``SegmentationLineFeatures`` instance is reused
    and re-yielded for each line — consumers must read the values before
    advancing the iterator.

    :param layout_document: the document to compute line features for
    """
    segmentation_line_features = SegmentationLineFeatures(
        document_features_context=self.document_features_context)
    previous_token: Optional[LayoutToken] = None
    # total token count across the whole document
    segmentation_line_features.document_token_count = sum(
        len(line.tokens)
        for block in layout_document.iter_all_blocks()
        for line in block.lines)
    # candidate blocks for repetitive-pattern detection: the first two and
    # the last block of each page (e.g. headers / footers / page numbers)
    pattern_candididate_block_iterable = (
        block
        for page in layout_document.pages
        for block_index, block in enumerate(page.blocks)
        if block_index < 2 or block_index > len(page.blocks) - 2)
    # only the first non-empty line of each candidate block is considered
    pattern_candididate_line_iterable = (
        block.lines[0]
        for block in pattern_candididate_block_iterable
        if block.lines and block.lines[0].tokens)
    all_pattern_by_line_id = {
        id(line): get_text_pattern(line.text)
        for line in pattern_candididate_line_iterable
    }
    LOGGER.debug('all_pattern_by_line_id: %s', all_pattern_by_line_id)
    # ignore short patterns
    pattern_by_line_id = {
        key: value
        for key, value in all_pattern_by_line_id.items()
        if len(value) >= 8  # Java GROBID sometimes counts an additional trailing space
    }
    pattern_counter = Counter(pattern_by_line_id.values())
    LOGGER.debug('pattern_counter: %s', pattern_counter)
    seen_repetitive_patterns: Set[str] = set()
    document_token_index = 0
    for page in layout_document.pages:
        blocks = page.blocks
        segmentation_line_features.page_blocks = blocks
        for block_index, block in enumerate(blocks):
            segmentation_line_features.page_block_index = block_index
            block_lines = block.lines
            segmentation_line_features.block_lines = block_lines
            block_line_texts = [line.text for line in block_lines]
            max_block_line_text_length = max(
                len(text) for text in block_line_texts)
            first_block_token = next(iter(block.iter_all_tokens()), None)
            assert first_block_token
            for line_index, line in enumerate(block_lines):
                segmentation_line_features.document_token_index = document_token_index
                document_token_index += len(line.tokens)
                segmentation_line_features.layout_line = line
                segmentation_line_features.block_line_index = line_index
                segmentation_line_features.max_block_line_text_length = (
                    max_block_line_text_length)
                line_text = block_line_texts[line_index]
                # split on whitespace variants incl. non-breaking space
                retokenized_token_texts = re.split(r" |\t|\f|\u00A0", line_text)
                if not retokenized_token_texts:
                    continue
                if self.use_first_token_of_block:
                    # Java GROBID uses the first token in the block
                    token = first_block_token
                else:
                    token = line.tokens[0]
                segmentation_line_features.layout_token = token
                segmentation_line_features.line_text = line_text
                segmentation_line_features.concatenated_line_tokens_text = line_text
                segmentation_line_features.token_text = retokenized_token_texts[
                    0].strip()
                segmentation_line_features.second_token_text = (
                    retokenized_token_texts[1]
                    if len(retokenized_token_texts) >= 2
                    else '')
                segmentation_line_features.previous_layout_token = previous_token
                # '' for lines not in the candidate pattern map
                line_pattern = pattern_by_line_id.get(id(line), '')
                LOGGER.debug('line_pattern: %r', line_pattern)
                segmentation_line_features.is_repetitive_pattern = (
                    pattern_counter[line_pattern] > 1)
                # 'first' only for the first line of each repeated pattern
                segmentation_line_features.is_first_repetitive_pattern = (
                    segmentation_line_features.is_repetitive_pattern
                    and line_pattern not in seen_repetitive_patterns)
                if segmentation_line_features.is_first_repetitive_pattern:
                    seen_repetitive_patterns.add(line_pattern)
                yield segmentation_line_features
                previous_token = token