def test_should_extract_single_annotated_line(self):
     """A single annotated line should produce one item with its tag and text."""
     document = SimpleStructuredDocument(
         lines=[annotated_line(TEXT_1, TAG_1)])
     tag_text_pairs = []
     for item in extract_from_annotated_document(document):
         tag_text_pairs.append((item.tag, item.text))
     assert tag_text_pairs == [(TAG_1, TEXT_1)]
 def test_should_extract_sub_tags_from_single_item(self):
     """Three TAG_1 tokens should form one item with TAG_2 and TAG_3 sub items."""
     # One row per token: (value, tag prefix, sub tag, sub tag prefix).
     # Every token carries TAG_1 as its main tag.
     token_specs = [
         (VALUE_1, B_TAG_PREFIX, TAG_2, B_TAG_PREFIX),
         (VALUE_2, I_TAG_PREFIX, TAG_2, I_TAG_PREFIX),
         (VALUE_3, I_TAG_PREFIX, TAG_3, B_TAG_PREFIX),
     ]
     tokens = [
         _token_with_sub_tag(value,
                             tag=TAG_1,
                             tag_prefix=prefix,
                             sub_tag=sub_tag,
                             sub_tag_prefix=sub_prefix)
         for value, prefix, sub_tag, sub_prefix in token_specs
     ]
     document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
     # Collect all items first, then read their attributes.
     extracted_items = list(extract_from_annotated_document(document))
     result = []
     for item in extracted_items:
         item_tag = item.tag
         item_text = item.text
         sub_pairs = [(sub_item.tag, sub_item.text)
                      for sub_item in item.sub_items]
         result.append((item_tag, item_text, sub_pairs))
     get_logger().debug('result: %s', result)
     expected_text = ' '.join([VALUE_1, VALUE_2, VALUE_3])
     expected_sub_items = [
         (TAG_2, ' '.join([VALUE_1, VALUE_2])),
         (TAG_3, VALUE_3),
     ]
     assert result == [(TAG_1, expected_text, expected_sub_items)]
 def test_should_extract_from_different_tag_scope(self):
     """A token tagged within TAG_SCOPE_1 is extracted when that scope is passed."""
     token = SimpleToken(TEXT_1, tag=TAG_1, tag_scope=TAG_SCOPE_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([token])])
     extracted_items = extract_from_annotated_document(
         document, tag_scope=TAG_SCOPE_1)
     result = [(item.tag, item.text) for item in extracted_items]
     assert result == [(TAG_1, TEXT_1)]
 def test_should_combine_multiple_lines(self):
     """Two lines annotated with TAG_1 should become one newline-joined item."""
     first_line = annotated_line(TEXT_1, TAG_1)
     second_line = annotated_line(TEXT_2, TAG_1)
     document = SimpleStructuredDocument(lines=[first_line, second_line])
     result = []
     for item in extract_from_annotated_document(document):
         result.append((item.tag, item.text))
     get_logger().debug('result: %s', result)
     expected_text = '\n'.join([TEXT_1, TEXT_2])
     assert result == [(TAG_1, expected_text)]
 def test_should_extract_multiple_annotations_on_single_line(self):
     """A line mixing tagged and untagged tokens should yield one item per run."""
     # Line layout: TEXT_1 tagged TAG_1, TEXT_2 without a tag, TEXT_3 tagged TAG_3.
     tagged_then_plain = annotated_tokens(TEXT_1, TAG_1) + to_tokens(TEXT_2)
     line = to_line(tagged_then_plain + annotated_tokens(TEXT_3, TAG_3))
     document = SimpleStructuredDocument(lines=[line])
     result = []
     for item in extract_from_annotated_document(document):
         result.append((item.tag, item.text))
     expected = [
         (TAG_1, TEXT_1),
         (None, TEXT_2),
         (TAG_3, TEXT_3),
     ]
     assert result == expected
 def test_should_separate_items_based_on_tag_prefix(self):
     """A second B_TAG_PREFIX token with the same tag should start a new item."""
     # One run of three TAG_1 tokens (B, I, I prefixes), repeated twice in a row.
     single_run = [
         (VALUE_1, B_TAG_PREFIX),
         (VALUE_2, I_TAG_PREFIX),
         (VALUE_3, I_TAG_PREFIX),
     ]
     tokens = [
         SimpleToken(value, tag=TAG_1, tag_prefix=prefix)
         for value, prefix in single_run * 2
     ]
     document = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
     result = []
     for item in extract_from_annotated_document(document):
         result.append((item.tag, item.text))
     get_logger().debug('result: %s', result)
     expected_text = ' '.join([VALUE_1, VALUE_2, VALUE_3])
     assert result == [(TAG_1, expected_text), (TAG_1, expected_text)]
 def test_should_not_fail_on_empty_document(self):
     """Extraction from a default-constructed document should not raise."""
     structured_document = SimpleStructuredDocument()
     # Consume the result instead of discarding it: the other tests only ever
     # iterate over the return value, and if it is produced lazily (e.g. by a
     # generator) a bare call would not execute any extraction code, letting
     # this test pass without checking anything.
     list(extract_from_annotated_document(structured_document))
# Example no. 8
# 0
def extract_structured_document_to_xml(structured_document, tag_scope=None):
    """Extract the annotated items of a document and convert them to XML.

    Args:
        structured_document: document handed to
            ``extract_from_annotated_document``.
        tag_scope: optional tag scope forwarded unchanged to the extraction.

    Returns:
        The result of ``extracted_items_to_xml`` for the extracted items.
    """
    extracted_items = extract_from_annotated_document(
        structured_document, tag_scope=tag_scope)
    return extracted_items_to_xml(extracted_items)