def iter_model_data_with_reset_instruction_iterable(
    model_data_or_instruction_iterable: Iterable[Union[LayoutModelData,
                                                       ExtractInstruction]]
) -> Iterable[Union[LayoutModelData, ExtractInstruction]]:
    # using extractor to re-use logic to split author names
    # here we will split on the first token of the extracted semantic content
    extractor = NameSemanticExtractor()
    model_data_or_instruction_list = list(model_data_or_instruction_iterable)
    entity_tokens = iter_entity_layout_blocks_for_labeled_layout_tokens([
        LabeledLayoutToken(label=get_model_data_label(model_data) or '',
                           layout_token=model_data.layout_token)
        for model_data in model_data_or_instruction_list
        if (isinstance(model_data, LayoutModelData)
            and model_data.layout_token is not None)
    ])
    LOGGER.debug('entity_tokens: %r', entity_tokens)
    reset_token_ids: Set[int] = set()
    for index, semantic_content in enumerate(
            extractor.iter_semantic_content_for_entity_blocks(
                entity_tokens=entity_tokens, name_type=SemanticAuthor)):
        if index == 0:
            continue
        for semantic_token in semantic_content.iter_tokens():
            reset_token_ids.add(id(semantic_token))
            break
    for model_data_or_instruction in model_data_or_instruction_list:
        if isinstance(model_data_or_instruction, LayoutModelData):
            model_data = model_data_or_instruction
            if id(model_data.layout_token) in reset_token_ids:
                yield ResetExtractInstruction(
                    ROOT_TRAINING_XML_ELEMENT_PATH[:-1])
        yield model_data_or_instruction
 def test_should_parse_multiple_markers(self):
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks([
             ('<forename>', LayoutBlock.for_text('John')),
             ('<surname>', LayoutBlock.for_text('Smith')),
             ('<marker>', LayoutBlock.for_text('1')),
             ('O', LayoutBlock.for_text(',')),
             ('<marker>', LayoutBlock.for_text('2')),
             ('O', LayoutBlock.for_text(',')),
             ('<forename>', LayoutBlock.for_text('Maria')),
             ('<surname>', LayoutBlock.for_text('Madison')),
             ('<marker>', LayoutBlock.for_text('2')),
             ('O', LayoutBlock.for_text(',')),
             ('<marker>', LayoutBlock.for_text('3')),
         ]))
     LOGGER.debug('semantic_content_list: %s', semantic_content_list)
     assert len(semantic_content_list) == 2
     author_1 = semantic_content_list[0]
     assert isinstance(author_1, SemanticAuthor)
     assert author_1.given_name_text == 'John'
     assert author_1.surname_text == 'Smith'
     assert author_1.view_by_type(SemanticMarker).get_text_list() == [
         '1', '2'
     ]
     author_2 = semantic_content_list[1]
     assert isinstance(author_2, SemanticAuthor)
     assert author_2.given_name_text == 'Maria'
     assert author_2.surname_text == 'Madison'
     assert author_2.view_by_type(SemanticMarker).get_text_list() == [
         '2', '3'
     ]
 def test_should_split_on_double_comma_before_marker(self):
     # The model currently does not provide segmentation as such
     # That is why the segmentation of authors is currently rule-based
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks([
             ('<marker>', LayoutBlock.for_text('1')),
             ('<forename>', LayoutBlock.for_text('John')),
             ('<surname>', LayoutBlock.for_text('Smith')),
             ('O', LayoutBlock.for_text(', ,')),
             ('<marker>', LayoutBlock.for_text('2')),
             ('<forename>', LayoutBlock.for_text('Maria')),
             ('<surname>', LayoutBlock.for_text('Madison')),
         ]))
     LOGGER.debug('semantic_content_list: %s', semantic_content_list)
     assert len(semantic_content_list) == 2
     author_1 = semantic_content_list[0]
     assert isinstance(author_1, SemanticAuthor)
     assert author_1.given_name_text == 'John'
     assert author_1.surname_text == 'Smith'
     assert author_1.view_by_type(SemanticMarker).get_text() == '1'
     author_2 = semantic_content_list[1]
     assert isinstance(author_2, SemanticAuthor)
     assert author_2.given_name_text == 'Maria'
     assert author_2.surname_text == 'Madison'
     assert author_2.view_by_type(SemanticMarker).get_text() == '2'
 def test_should_consider_names_without_last_name_as_invalid(self):
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks(
             [('<forename>', LayoutBlock.for_text('J'))],
             name_type=SemanticAuthor))
     assert len(semantic_content_list) == 1
     author = semantic_content_list[0]
     assert isinstance(author, SemanticNote)
     assert author.get_text() == 'J'
 def test_should_be_able_to_extract_single_editor(self):
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks(
             [('<forename>', LayoutBlock.for_text('John')),
              ('<surname>', LayoutBlock.for_text('Smith'))],
             name_type=SemanticEditor))
     assert len(semantic_content_list) == 1
     author = semantic_content_list[0]
     assert isinstance(author, SemanticEditor)
     assert author.given_name_text == 'John'
     assert author.surname_text == 'Smith'
 def test_should_extract_single_author(self):
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks(
             [('<title>', LayoutBlock.for_text('Mr')),
              ('<forename>', LayoutBlock.for_text('John')),
              ('<middlename>', LayoutBlock.for_text('M')),
              ('<surname>', LayoutBlock.for_text('Smith')),
              ('<suffix>', LayoutBlock.for_text('Suffix1')),
              ('<marker>', LayoutBlock.for_text('1'))],
             name_type=SemanticAuthor))
     assert len(semantic_content_list) == 1
     author = semantic_content_list[0]
     assert isinstance(author, SemanticAuthor)
     assert author.view_by_type(SemanticNameTitle).get_text() == 'Mr'
     assert author.given_name_text == 'John'
     assert author.view_by_type(SemanticMiddleName).get_text() == 'M'
     assert author.surname_text == 'Smith'
     assert author.view_by_type(SemanticNameSuffix).get_text() == 'Suffix1'
     assert author.view_by_type(SemanticMarker).get_text() == '1'
 def test_should_split_on_second_surname(self):
     # This matches the current rules in GROBID
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks([
             ('<surname>', LayoutBlock.for_text('Smith')),
             ('<forename>', LayoutBlock.for_text('John')),
             ('<surname>', LayoutBlock.for_text('Madison')),
             ('<forename>', LayoutBlock.for_text('Maria')),
         ]))
     LOGGER.debug('semantic_content_list: %s', semantic_content_list)
     assert len(semantic_content_list) == 2
     author_1 = semantic_content_list[0]
     assert isinstance(author_1, SemanticAuthor)
     assert author_1.given_name_text == 'John'
     assert author_1.surname_text == 'Smith'
     author_2 = semantic_content_list[1]
     assert isinstance(author_2, SemanticAuthor)
     assert author_2.given_name_text == 'Maria'
     assert author_2.surname_text == 'Madison'
 def test_should_treat_two_letter_uppercase_given_name_as_given_midde_name(
         self):
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks(
             [
                 ('<forename>', LayoutBlock.for_text('JM')),
                 ('<surname>', LayoutBlock.for_text('SMITH')),
             ],
             name_type=SemanticAuthor))
     assert len(semantic_content_list) == 1
     author = semantic_content_list[0]
     assert isinstance(author, SemanticAuthor)
     given_name = list(author.iter_by_type(SemanticGivenName))[0]
     assert given_name.get_text() == 'J'
     assert given_name.value == 'J'
     middle_name = list(author.iter_by_type(SemanticMiddleName))[0]
     assert middle_name.get_text() == 'M'
     assert middle_name.value == 'M'
     surname = list(author.iter_by_type(SemanticSurname))[0]
     assert surname.get_text() == 'SMITH'
     assert surname.value == 'Smith'
 def test_should_convert_name_parts_to_title_case(self):
     semantic_content_list = list(
         NameSemanticExtractor().iter_semantic_content_for_entity_blocks(
             [
                 ('<forename>', LayoutBlock.for_text('JOHN')),
                 ('<middlename>', LayoutBlock.for_text('MIDDLE')),
                 ('<surname>', LayoutBlock.for_text('SMITH')),
             ],
             name_type=SemanticAuthor))
     assert len(semantic_content_list) == 1
     author = semantic_content_list[0]
     assert isinstance(author, SemanticAuthor)
     given_name = list(author.iter_by_type(SemanticGivenName))[0]
     assert given_name.get_text() == 'JOHN'
     assert given_name.value == 'John'
     middle_name = list(author.iter_by_type(SemanticMiddleName))[0]
     assert middle_name.get_text() == 'MIDDLE'
     assert middle_name.value == 'Middle'
     surname = list(author.iter_by_type(SemanticSurname))[0]
     assert surname.get_text() == 'SMITH'
     assert surname.value == 'Smith'
Example #10
0
 def get_semantic_extractor(self) -> NameSemanticExtractor:
     return NameSemanticExtractor()