Example #1
0
 def test_should_generate_tei_from_model_data(self):
     """Affiliation TEI for a two-line block keeps both lines and line breaks.

     Expects exactly one affiliation node containing two ``tei:lb`` elements,
     with TEXT_1 as the text of the first line break's parent and TEXT_2
     (after a newline) as the tail of the first line break.
     """
     block = LayoutBlock(lines=[
         get_next_layout_line_for_text(line_text)
         for line_text in (TEXT_1, TEXT_2)
     ])
     layout_document = LayoutDocument.for_blocks([block])
     model_data_iterable = (
         get_data_generator()
         .iter_model_data_for_layout_document(layout_document)
     )
     tei_generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
         model_data_iterable
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     line_breaks = tei_xpath(aff_nodes[0], 'tei:lb')
     assert len(line_breaks) == 2
     first_line_break = line_breaks[0]
     assert first_line_break.getparent().text == TEXT_1
     assert first_line_break.tail == '\n' + TEXT_2
Example #2
0
 def test_should_generate_tei_from_model_data(self):
     """Full text TEI for a two-line block keeps both lines and line breaks.

     Expects exactly one ``text`` node containing two ``lb`` elements, with
     TEXT_1 as the text of the first line break's parent and TEXT_2 (after a
     newline) as the tail of the first line break.
     """
     block = LayoutBlock(lines=[
         get_next_layout_line_for_text(line_text)
         for line_text in (TEXT_1, TEXT_2)
     ])
     layout_document = LayoutDocument.for_blocks([block])
     model_data_iterable = (
         get_data_generator()
         .iter_model_data_for_layout_document(layout_document)
     )
     tei_generator = FullTextTeiTrainingDataGenerator()
     xml_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
         model_data_iterable
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     text_nodes = xml_root.xpath('./text')
     assert len(text_nodes) == 1
     line_breaks = text_nodes[0].xpath('lb')
     assert len(line_breaks) == 2
     first_line_break = line_breaks[0]
     assert first_line_break.getparent().text == TEXT_1
     assert first_line_break.tail == '\n' + TEXT_2
Example #3
0
 def test_should_provide_repetitive_pattern_feature(
         self, features_provider: SegmentationLineFeaturesProvider):
     """The first block text repeated on both pages is flagged as repetitive.

     Only the first occurrence is additionally flagged as the *first*
     repetitive pattern; the non-repeated blocks get '0' for both features.
     """
     first_page = LayoutPage(blocks=[
         LayoutBlock.for_text('this is repetitive'),
         LayoutBlock.for_text('this is not')
     ])
     second_page = LayoutPage(blocks=[
         LayoutBlock.for_text('this is repetitive'),
         LayoutBlock.for_text('it is different')
     ])
     layout_document = LayoutDocument(pages=[first_page, second_page])
     feature_values = [
         {
             'get_str_is_repetitive_pattern':
             line_features.get_str_is_repetitive_pattern(),
             'get_str_is_first_repetitive_pattern':
             line_features.get_str_is_first_repetitive_pattern()
         }
         for line_features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     # (is repetitive, is first repetitive) per line, in document order
     expected_flags = [('1', '1'), ('0', '0'), ('1', '0'), ('0', '0')]
     assert feature_values == [
         {
             'get_str_is_repetitive_pattern': is_repetitive,
             'get_str_is_first_repetitive_pattern': is_first_repetitive
         }
         for is_repetitive, is_first_repetitive in expected_flags
     ]
Example #4
0
    def test_should_extract_acknowledgement_only(
            self, fulltext_models_mock: MockFullTextModels):
        """A lone acknowledgement block ends up in the back section.

        The block is labelled ``<acknowledgement>`` by the segmentation model
        mock and ``<paragraph>`` by the fulltext model mock.
        """
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        acknowledgment_block = LayoutBlock.for_text('Some acknowledgement')

        labels_by_model_mock = (
            (fulltext_models_mock.segmentation_model_mock, '<acknowledgement>'),
            (fulltext_models_mock.fulltext_model_mock, '<paragraph>'),
        )
        for model_mock, label in labels_by_model_mock:
            model_mock.update_label_by_layout_block(
                acknowledgment_block, label)

        page = LayoutPage(blocks=[acknowledgment_block])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        assert semantic_document is not None
        acknowledgement_view = semantic_document.back_section.view_by_section_type(
            SemanticSectionTypes.ACKNOWLEDGEMENT)
        assert acknowledgement_view.get_text() == acknowledgment_block.text
Example #5
0
 def test_should_filter_by_token_multiple_labels(self):
     """Filtering by TAG_1 and TAG_3 keeps exactly the tokens of those tags."""
     tokens_by_tag = [
         (tag, get_layout_tokens_for_text(text))
         for tag, text in (
             (TAG_1, 'tokens tag 1'),
             (TAG_2, 'tokens tag 2'),
             (TAG_3, 'tokens tag 3'),
         )
     ]
     # all tokens share a single line, in tag order
     all_tokens = []
     for _, tag_tokens in tokens_by_tag:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)
     layout_model_labels = []
     for tag, tag_tokens in tokens_by_tag:
         for token in tag_tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))
     page = LayoutPage(blocks=[LayoutBlock(lines=[line])])
     layout_document_label_result = LayoutDocumentLabelResult(
         LayoutDocument(pages=[page]), layout_model_labels)
     filtered_document = (
         layout_document_label_result.get_filtered_document_by_labels(
             [TAG_1, TAG_3]))
     actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
     expected_text = join_layout_tokens(
         tokens_by_tag[0][1] + tokens_by_tag[2][1])
     assert actual_text == expected_text
Example #6
0
    def test_should_extract_author_names_separated_by_another_tag(
            self, fulltext_models_mock: MockFullTextModels):
        """Given name and surname split by an unlabelled block form one author.

        The processor is configured with ``merge_raw_authors=True``; only the
        given name and surname blocks are labelled ``<author>``.
        """
        given_name_block = LayoutBlock.for_text('Given name')
        surname_block = LayoutBlock.for_text('Surname')
        other_block = LayoutBlock.for_text('Other')
        # the header consists solely of the (interrupted) authors block
        header_block = LayoutBlock.merge_blocks(
            [given_name_block, other_block, surname_block])
        processor_config = FullTextProcessorConfig(merge_raw_authors=True)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, config=processor_config)

        segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
        header_model_mock = fulltext_models_mock.header_model_mock
        name_header_model_mock = fulltext_models_mock.name_header_model_mock

        segmentation_model_mock.update_label_by_layout_block(
            header_block, '<header>')

        name_parts = [
            (given_name_block, '<forename>'),
            (surname_block, '<surname>'),
        ]
        for name_block, _ in name_parts:
            header_model_mock.update_label_by_layout_block(
                name_block, '<author>')
        for name_block, name_label in name_parts:
            name_header_model_mock.update_label_by_layout_block(
                name_block, name_label)

        page = LayoutPage(blocks=[header_block])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        assert semantic_document is not None
        authors = semantic_document.front.authors
        assert len(authors) == 1
        author = authors[0]
        assert author.given_name_text == given_name_block.text
        assert author.surname_text == surname_block.text
Example #7
0
 def iter_filter_layout_document(
         self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
     """Return one layout document per raw affiliation address in the header.

     The document is first reduced to the parts with the ``<header>``
     segmentation label, then labelled by the header model. Each
     SemanticRawAffiliationAddress found in the result is converted into its
     own layout document, with empty blocks removed.
     """
     header_layout_document = self.filter_layout_document_by_segmentation_label(
         layout_document, '<header>')
     header_model = self.header_model
     labeled_layout_tokens = header_model.predict_labels_for_layout_document(
         header_layout_document,
         app_features_context=self.app_features_context)
     LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
     header_semantic_content = list(
         header_model.iter_semantic_content_for_labeled_layout_tokens(
             labeled_layout_tokens))
     semantic_raw_aff_address_list = list(
         SemanticMixedContentWrapper(header_semantic_content).iter_by_type(
             SemanticRawAffiliationAddress))
     LOGGER.info('semantic_raw_aff_address_list count: %d',
                 len(semantic_raw_aff_address_list))
     aff_address_layout_documents = []
     for semantic_raw_aff_address in semantic_raw_aff_address_list:
         aff_address_blocks = list(semantic_raw_aff_address.iter_blocks())
         aff_address_layout_documents.append(
             LayoutDocument.for_blocks(aff_address_blocks)
             .remove_empty_blocks())
     return aff_address_layout_documents
Example #8
0
    def test_should_extract_raw_references_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """A labelled reference is extracted as a single raw reference.

        Citation field extraction is disabled, so the reference list is
        expected to contain one SemanticRawReference with label, raw text and
        the content id ``b0``.
        """
        label_block = LayoutBlock.for_text('1')
        ref_text_block = LayoutBlock.for_text('Reference 1')
        ref_block = LayoutBlock.merge_blocks([label_block, ref_text_block])
        processor_config = FullTextProcessorConfig(extract_citation_fields=False)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, processor_config)

        segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
        reference_segmenter_model_mock = fulltext_models_mock.reference_segmenter_model_mock

        segmentation_model_mock.update_label_by_layout_block(
            ref_block, '<references>')

        for segment_block, segment_label in (
            (label_block, '<label>'),
            (ref_text_block, '<reference>'),
        ):
            reference_segmenter_model_mock.update_label_by_layout_block(
                segment_block, segment_label)

        page = LayoutPage(blocks=[ref_block])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None
        back_section = semantic_document.back_section
        assert back_section.get_text() == ref_block.text
        reference_list = list(back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_list) == 1
        references = list(reference_list[0].iter_by_type(SemanticRawReference))
        assert len(references) == 1
        ref = references[0]
        assert ref.get_text_by_type(SemanticLabel) == label_block.text
        raw_reference_text = ref.get_text_by_type(SemanticRawReferenceText)
        assert raw_reference_text == ref_text_block.text
        assert ref.content_id == 'b0'
Example #9
0
    def test_should_not_merge_separate_raw_affiliations(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Two separately labelled affiliations stay two affiliations.

        Each institution block is its own ``<affiliation>`` in the header and
        is expected to come back with its own text and content id.
        """
        institution_blocks = [
            LayoutBlock.for_text(f'Institution{suffix}')
            for suffix in ('1', '2')
        ]
        # every affiliation consists of just its institution block
        aff_blocks = institution_blocks
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        header_block = LayoutBlock.merge_blocks(aff_blocks)

        segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
        header_model_mock = fulltext_models_mock.header_model_mock
        affiliation_address_model_mock = fulltext_models_mock.affiliation_address_model_mock

        segmentation_model_mock.update_label_by_layout_block(
            header_block, '<header>')

        for aff_block in aff_blocks:
            header_model_mock.update_label_by_layout_block(
                aff_block, '<affiliation>')

        for institution_block in institution_blocks:
            affiliation_address_model_mock.update_label_by_layout_block(
                institution_block, '<institution>')

        page = LayoutPage(blocks=[header_block])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        assert semantic_document is not None
        affiliations = list(
            semantic_document.front.iter_by_type(SemanticAffiliationAddress))
        LOGGER.debug('affiliations: %r', affiliations)
        actual_texts = [aff.get_text() for aff in affiliations]
        expected_texts = [aff_block.text for aff_block in aff_blocks]
        assert actual_texts == expected_texts
        actual_content_ids = [aff.content_id for aff in affiliations]
        assert actual_content_ids == ['aff0', 'aff1']
Example #10
0
def get_layout_document_with_text_or_graphic_replaced_by_graphics(
        layout_document: LayoutDocument,
        semantic_graphics: Iterable[SemanticGraphic],
        is_replace_overlapping_text: bool) -> LayoutDocument:
    """Apply each semantic graphic to the page it is located on.

    Pages are looked up by ``page.meta.page_number``; pages without meta are
    passed through untouched. Semantic graphics whose layout graphic has no
    coordinates are skipped. When nothing was replaced, the passed in
    document itself is returned.

    Raises:
        AssertionError: if a semantic graphic has no layout graphic.
        KeyError: if a graphic refers to a page number that none of the
            document's pages (with meta) has.
    """
    page_by_page_number = {}
    for page in layout_document.pages:
        if page.meta:
            page_by_page_number[page.meta.page_number] = page
    LOGGER.debug('page_by_page_number.keys: %r', page_by_page_number.keys())
    semantic_graphics_list = list(semantic_graphics)
    # number of located graphics per page, used to tell whether a graphic
    # is the only one on its page
    semantic_graphic_count_by_page = Counter()
    for semantic_graphic in semantic_graphics_list:
        candidate_graphic = semantic_graphic.layout_graphic
        if candidate_graphic and candidate_graphic.coordinates:
            semantic_graphic_count_by_page[
                candidate_graphic.coordinates.page_number] += 1
    replaced_any_page = False
    for semantic_graphic in semantic_graphics_list:
        layout_graphic = semantic_graphic.layout_graphic
        assert layout_graphic
        coordinates = layout_graphic.coordinates
        if not coordinates:
            continue
        page_number = coordinates.page_number
        current_page = page_by_page_number[page_number]
        is_only_graphic_on_page = (
            semantic_graphic_count_by_page[page_number] < 2)
        page_by_page_number[page_number] = (
            get_layout_page_with_text_or_graphic_replaced_by_graphic(
                current_page,
                semantic_graphic,
                is_only_semantic_graphic_on_page=is_only_graphic_on_page,
                is_replace_overlapping_text=is_replace_overlapping_text))
        replaced_any_page = True
    if not replaced_any_page:
        return layout_document
    updated_pages = []
    for page in layout_document.pages:
        if page.meta:
            updated_pages.append(page_by_page_number[page.meta.page_number])
        else:
            updated_pages.append(page)
    return layout_document.replace(pages=updated_pages)
Example #11
0
 def _process_raw_affiliations(self, semantic_document: SemanticDocument):
     """Replace raw affiliation addresses in the front with parsed ones.

     All SemanticRawAffiliationAddress items are taken out of the front
     content and passed to the affiliation address model. The semantic
     content produced by the model is appended after the remaining front
     content. Finally the SemanticAffiliationAddress items in the front are
     assigned ids generated via ``iter_ids('aff')``.
     """
     result_content: List[SemanticContentWrapper] = []
     raw_aff_address_list: List[SemanticRawAffiliationAddress] = []
     for semantic_content in semantic_document.front:
         if isinstance(semantic_content, SemanticRawAffiliationAddress):
             raw_aff_address_list.append(semantic_content)
         else:
             result_content.append(semantic_content)
     if raw_aff_address_list:
         aff_model = self.affiliation_address_model
         raw_aff_layout_documents = []
         for raw_aff_or_address in raw_aff_address_list:
             raw_aff_layout_documents.append(
                 LayoutDocument.for_blocks(
                     list(raw_aff_or_address.iter_blocks())))
         labeled_layout_tokens_list = aff_model.predict_labels_for_layout_documents(
             raw_aff_layout_documents,
             app_features_context=self.app_features_context
         )
         LOGGER.debug('labeled_layout_tokens_list (aff): %r', labeled_layout_tokens_list)
         for labeled_layout_tokens in labeled_layout_tokens_list:
             result_content.extend(
                 aff_model.iter_semantic_content_for_labeled_layout_tokens(
                     labeled_layout_tokens))
     semantic_document.front.mixed_content = result_content
     affiliations = semantic_document.front.iter_by_type(
         SemanticAffiliationAddress)
     self._assign_content_ids(affiliations, iter(iter_ids('aff')))
Example #12
0
 def test_should_provide_punctuation_profile(
         self, features_provider: SegmentationLineFeaturesProvider):
     """The line 'a .: b' has the punctuation profile '.:' of length '2'."""
     line = LayoutLine.for_text('a .: b')
     page = LayoutPage(blocks=[LayoutBlock(lines=[line])])
     layout_document = LayoutDocument(pages=[page])
     feature_values = [
         {
             'line_punctuation_profile':
             line_features.get_line_punctuation_profile(),
             'line_punctuation_profile_length_feature':
             line_features.get_line_punctuation_profile_length_feature(),
         }
         for line_features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_line_features = {
         'line_punctuation_profile': '.:',
         'line_punctuation_profile_length_feature': '2'
     }
     assert feature_values == [expected_line_features]
Example #13
0
    def test_should_extract_author_names_from_document(
            self, fulltext_models_mock: MockFullTextModels):
        """Adjacent given name and surname blocks form a single author.

        The whole authors block is labelled ``<author>`` by the header model
        mock; the name header model mock then splits it into forename and
        surname.
        """
        given_name_block = LayoutBlock.for_text('Given name')
        surname_block = LayoutBlock.for_text('Surname')
        authors_block = LayoutBlock.merge_blocks(
            [given_name_block, surname_block])
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        # the header consists solely of the authors block
        header_block = authors_block

        segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
        header_model_mock = fulltext_models_mock.header_model_mock
        name_header_model_mock = fulltext_models_mock.name_header_model_mock

        segmentation_model_mock.update_label_by_layout_block(
            header_block, '<header>')
        header_model_mock.update_label_by_layout_block(
            authors_block, '<author>')
        for name_block, name_label in (
            (given_name_block, '<forename>'),
            (surname_block, '<surname>'),
        ):
            name_header_model_mock.update_label_by_layout_block(
                name_block, name_label)

        page = LayoutPage(blocks=[header_block])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        assert semantic_document is not None
        front = semantic_document.front
        assert front.get_text() == authors_block.text
        author_view_text = front.view_by_type(SemanticAuthor).get_text()
        assert author_view_text == authors_block.text
        authors = front.authors
        assert len(authors) == 1
        author = authors[0]
        assert author.given_name_text == given_name_block.text
        assert author.surname_text == surname_block.text
Example #14
0
 def iter_filter_layout_document(
         self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
     """Return one layout document per raw reference in the references.

     The document is first reduced to the parts with the ``<references>``
     segmentation label, then labelled by the reference segmenter model. For
     each SemanticRawReference found, the merged block of its
     SemanticRawReferenceText becomes a layout document of its own, with
     empty blocks removed.
     """
     references_layout_document = self.filter_layout_document_by_segmentation_label(
         layout_document, '<references>')
     segmenter_model = self.reference_segmenter_model
     labeled_layout_tokens = segmenter_model.predict_labels_for_layout_document(
         references_layout_document,
         app_features_context=self.app_features_context)
     LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
     references_semantic_content = list(
         segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
             labeled_layout_tokens))
     semantic_raw_references = list(
         SemanticMixedContentWrapper(references_semantic_content).iter_by_type(
             SemanticRawReference))
     LOGGER.info('semantic_raw_references count: %d',
                 len(semantic_raw_references))
     reference_layout_documents = []
     for semantic_raw_reference in semantic_raw_references:
         reference_text_block = semantic_raw_reference.view_by_type(
             SemanticRawReferenceText).merged_block
         reference_layout_documents.append(
             LayoutDocument.for_blocks([reference_text_block])
             .remove_empty_blocks())
     return reference_layout_documents
 def test_should_provide_empty_list_for_empty_document(self):
     """A document without pages has no mostly-bitmap page numbers."""
     empty_document = LayoutDocument(pages=[])
     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(
         empty_document)
     assert page_numbers == []
 def test_should_provide_empty_list_for_empty_document(self):
     """A document without pages has no uncommon page dimension page numbers."""
     empty_document = LayoutDocument(pages=[])
     page_numbers = get_page_numbers_with_uncommon_page_dimension(
         empty_document)
     assert page_numbers == []
def _get_layout_document_for_layout_graphic(
        layout_graphic: LayoutGraphic) -> LayoutDocument:
    """Wrap a single graphic in a one-page document without any blocks."""
    graphic_only_page = LayoutPage(blocks=[], graphics=[layout_graphic])
    return LayoutDocument(pages=[graphic_only_page])
 def test_should_not_change_layout_document_if_semantic_graphics_is_empty(
         self):
     """Without semantic graphics the result equals the input document."""
     empty_document = LayoutDocument(pages=[])
     replaced_document = get_layout_document_with_graphics_replaced_by_graphics(
         empty_document, semantic_graphics=[])
     assert replaced_document == empty_document
Example #19
0
    def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """A figure with label, caption, citation and graphic is extracted.

        The page holds a citation ('Figure 1') followed by a figure block made
        of a label, an unlabelled block and a caption, plus one graphic. Label
        and caption are positioned 10 and 20 units below the graphic's
        coordinates. The figure is expected to get the id ``fig_0``, to be the
        target of the citation and to contain the graphic.
        """
        citation_block = LayoutBlock.for_text('Figure 1')
        graphic_coordinates = LayoutPageCoordinates(
            x=10, y=10, width=100, height=10)
        graphic_local_file_path = '/path/to/graphic1.svg'
        graphic = LayoutGraphic(coordinates=graphic_coordinates,
                                local_file_path=graphic_local_file_path)
        label_coordinates = graphic_coordinates.move_by(dy=10)
        label_block = LayoutBlock.for_text(
            'Figure 1', coordinates=label_coordinates)
        caption_coordinates = label_coordinates.move_by(dy=10)
        caption_block = LayoutBlock.for_text(
            'Caption 1', coordinates=caption_coordinates)
        other_block = LayoutBlock.for_text('Other')
        figure_block = LayoutBlock.merge_blocks(
            [label_block, other_block, caption_block])
        fulltext_block = LayoutBlock.merge_blocks(
            [citation_block, figure_block])
        processor_config = FullTextProcessorConfig(
            extract_figure_fields=True,
            extract_graphic_bounding_boxes=True,
            extract_graphic_assets=True)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, processor_config)

        segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
        figure_model_mock = fulltext_models_mock.figure_model_mock

        segmentation_model_mock.update_label_by_layout_block(
            fulltext_block, segmentation_label)

        for body_block, body_label in (
            (citation_block, '<figure_marker>'),
            (figure_block, '<figure>'),
        ):
            fulltext_model_mock.update_label_by_layout_block(
                body_block, body_label)

        for figure_part_block, figure_part_label in (
            (label_block, '<label>'),
            (caption_block, '<figDesc>'),
        ):
            figure_model_mock.update_label_by_layout_block(
                figure_part_block, figure_part_label)

        page = LayoutPage(blocks=[fulltext_block], graphics=[graphic])
        semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[page]))
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None
        # the figure may be in the body or the back section
        searched_sections = [
            semantic_document.body_section, semantic_document.back_section
        ]
        figure_list = list(
            iter_by_semantic_type_recursively(searched_sections, SemanticFigure))
        assert len(figure_list) == 1
        figure = figure_list[0]
        assert figure.get_text_by_type(SemanticLabel) == label_block.text
        assert figure.get_text_by_type(SemanticCaption) == caption_block.text
        assert figure.content_id == 'fig_0'
        figure_citation_list = list(
            semantic_document.iter_by_type_recursively(SemanticFigureCitation))
        assert len(figure_citation_list) == 1
        figure_citation = figure_citation_list[0]
        assert figure_citation.get_text() == citation_block.text
        assert figure_citation.target_content_id == 'fig_0'
        semantic_graphic_list = list(figure.iter_by_type(SemanticGraphic))
        assert semantic_graphic_list
        first_semantic_graphic = semantic_graphic_list[0]
        assert first_semantic_graphic.layout_graphic == graphic
        expected_relative_path = os.path.basename(graphic_local_file_path)
        assert first_semantic_graphic.relative_path == expected_relative_path
Example #20
0
def normalize_layout_document(layout_document: LayoutDocument,
                              **kwargs) -> LayoutDocument:
    """Retokenize the document, then remove its empty blocks.

    Tokens are re-created using ``normalize_and_tokenize_text``. Any keyword
    arguments are forwarded to ``remove_empty_blocks``.
    """
    retokenized_document = layout_document.retokenize(
        tokenize_fn=normalize_and_tokenize_text)
    return retokenized_document.remove_empty_blocks(**kwargs)
Example #21
0
 def iter_semantic_graphic_for_layout_document(
         self, layout_document: LayoutDocument,
         extract_graphic_assets: bool) -> Iterable[SemanticGraphic]:
     """Return the semantic graphics for all graphics of the document.

     ``extract_graphic_assets`` is passed through to
     ``get_semantic_graphic_list_for_layout_graphic_list`` unchanged.
     """
     layout_graphics = layout_document.iter_all_graphics()
     return get_semantic_graphic_list_for_layout_graphic_list(
         layout_graphics, extract_graphic_assets=extract_graphic_assets)
Example #22
0
 def iter_line_features(  # pylint: disable=too-many-locals
         self, layout_document: LayoutDocument
 ) -> Iterable[SegmentationLineFeatures]:
     """Yield the segmentation features for every line of the document.

     A single SegmentationLineFeatures instance is created up front, updated
     in place for each line and yielded again, i.e. every yielded item is the
     same object. Values therefore need to be read before the iterator is
     advanced.

     Lines are visited in page, block and line order. In addition to the
     per-line state, each line is flagged as repetitive when its text pattern
     (see ``get_text_pattern``) occurs more than once among the pattern
     candidate lines of the document.
     """
     segmentation_line_features = SegmentationLineFeatures(
         document_features_context=self.document_features_context)
     previous_token: Optional[LayoutToken] = None
     segmentation_line_features.document_token_count = sum(
         len(line.tokens) for block in layout_document.iter_all_blocks()
         for line in block.lines)
     # pattern candidates are the first two blocks and the last block of
     # every page (presumably where page headers / footers are located)
     pattern_candididate_block_iterable = (
         block for page in layout_document.pages
         for block_index, block in enumerate(page.blocks)
         if block_index < 2 or block_index > len(page.blocks) - 2)
     # of those blocks only the first line is considered, if it has tokens
     pattern_candididate_line_iterable = (
         block.lines[0] for block in pattern_candididate_block_iterable
         if block.lines and block.lines[0].tokens)
     # keyed by object identity of the line, looked up again in the main loop
     all_pattern_by_line_id = {
         id(line): get_text_pattern(line.text)
         for line in pattern_candididate_line_iterable
     }
     LOGGER.debug('all_pattern_by_line_id: %s', all_pattern_by_line_id)
     # patterns shorter than 8 characters are not considered
     pattern_by_line_id = {
         key: value
         for key, value in all_pattern_by_line_id.items() if len(value) >=
         8  # Java GROBID sometimes counts an additional trailing space
     }
     pattern_counter = Counter(pattern_by_line_id.values())
     LOGGER.debug('pattern_counter: %s', pattern_counter)
     seen_repetitive_patterns: Set[str] = set()
     # index of the first token of the current line within the document
     document_token_index = 0
     for page in layout_document.pages:
         blocks = page.blocks
         segmentation_line_features.page_blocks = blocks
         for block_index, block in enumerate(blocks):
             segmentation_line_features.page_block_index = block_index
             block_lines = block.lines
             segmentation_line_features.block_lines = block_lines
             block_line_texts = [line.text for line in block_lines]
             # NOTE(review): max() raises ValueError for a block without
             # lines, and the assert below fails for a block without tokens;
             # presumably empty blocks are removed by the caller - confirm
             max_block_line_text_length = max(
                 len(text) for text in block_line_texts)
             first_block_token = next(iter(block.iter_all_tokens()), None)
             assert first_block_token
             for line_index, line in enumerate(block_lines):
                 segmentation_line_features.document_token_index = document_token_index
                 document_token_index += len(line.tokens)
                 segmentation_line_features.layout_line = line
                 segmentation_line_features.block_line_index = line_index
                 segmentation_line_features.max_block_line_text_length = (
                     max_block_line_text_length)
                 line_text = block_line_texts[line_index]
                 # split on space, tab, form feed and non-breaking space
                 retokenized_token_texts = re.split(r" |\t|\f|\u00A0",
                                                    line_text)
                 # NOTE(review): re.split returns at least one element, even
                 # for an empty string, so this branch looks unreachable
                 if not retokenized_token_texts:
                     continue
                 if self.use_first_token_of_block:
                     # Java GROBID uses the first token in the block
                     token = first_block_token
                 else:
                     # NOTE(review): raises IndexError for a line without
                     # tokens
                     token = line.tokens[0]
                 segmentation_line_features.layout_token = token
                 segmentation_line_features.line_text = line_text
                 segmentation_line_features.concatenated_line_tokens_text = line_text
                 segmentation_line_features.token_text = retokenized_token_texts[
                     0].strip()
                 segmentation_line_features.second_token_text = (
                     retokenized_token_texts[1]
                     if len(retokenized_token_texts) >= 2 else '')
                 segmentation_line_features.previous_layout_token = previous_token
                 # lines that are not pattern candidates get '', which is
                 # never counted because counted patterns have 8+ characters
                 line_pattern = pattern_by_line_id.get(id(line), '')
                 LOGGER.debug('line_pattern: %r', line_pattern)
                 segmentation_line_features.is_repetitive_pattern = (
                     pattern_counter[line_pattern] > 1)
                 # only the first line with a given repetitive pattern is
                 # flagged as the first occurrence
                 segmentation_line_features.is_first_repetitive_pattern = (
                     segmentation_line_features.is_repetitive_pattern
                     and line_pattern not in seen_repetitive_patterns)
                 if segmentation_line_features.is_first_repetitive_pattern:
                     seen_repetitive_patterns.add(line_pattern)
                 yield segmentation_line_features
                 previous_token = token