def test_should_annotate_exactly_matching_across_multiple_lines(self):
    """A single target annotation spanning two lines tags every token."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    all_tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation('this is matching and continues here', TAG1)
    ]).annotate(doc)
    expected = [TAG1] * len(all_tokens)
    assert _get_tags_of_tokens(all_tokens) == expected
def test_should_annotate_short_reference_item_followed_by_other_reference_items(
        self):
    """Bonded reference items appearing together are all tagged."""
    item_texts = ['ref_id', 'ref_title']
    item_tokens = _tokens_for_text(' '.join(item_texts))
    doc = _document_for_tokens([item_tokens])
    MatchingAnnotator([
        TargetAnnotation(item_texts, 'reference', bonding=True)
    ]).annotate(doc)
    assert _get_tags_of_tokens(item_tokens) == ['reference'] * len(item_tokens)
def test_should_annotate_sub_tag_exactly_matching_without_begin_prefix(self):
    """Sub-annotations are applied without B-/I- prefixes when disabled."""
    tokens = _tokens_for_text('this is matching')
    annotation = TargetAnnotation(
        'this is matching', TAG2,
        sub_annotations=[TargetAnnotation('this', TAG1)]
    )
    doc = _document_for_tokens([tokens])
    MatchingAnnotator(
        [annotation], use_tag_begin_prefix=False
    ).annotate(doc)
    sub_tags = [doc.get_sub_tag(token) for token in tokens]
    assert sub_tags == [TAG1, None, None]
def test_should_annotate_same_sequence_multiple_times_if_enabled(self):
    """With match_multiple, a repeated sequence is tagged at every occurrence."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('this is matching')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation('this is matching', TAG1, match_multiple=True)
    ]).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_not_annotate_pre_annotated_tokens_on_separate_lines(self):
    """Tokens already carrying a tag keep it; only untagged tokens are matched."""
    line_no_tokens = _tokens_for_text('1')
    line_no_tokens[0].set_tag('line_no')
    matching_tokens = _tokens_for_text('this is matching')
    doc = _document_for_tokens([line_no_tokens + matching_tokens])
    MatchingAnnotator([
        TargetAnnotation('1', TAG2),
        TargetAnnotation('this is matching', TAG1)
    ]).annotate(doc)
    # the pre-set 'line_no' tag must not be overwritten by TAG2
    assert _get_tags_of_tokens(line_no_tokens) == (
        ['line_no'] * len(line_no_tokens)
    )
    assert _get_tags_of_tokens(matching_tokens) == (
        [TAG1] * len(matching_tokens)
    )
def test_should_not_annotate_shorter_target_annotation_in_longer_line_multiple_times(
        self):
    """Without match_multiple only the first occurrence is tagged."""
    matching_tokens = _tokens_for_text('this is matching')
    first_line = (
        _tokens_for_text('pre') + matching_tokens + _tokens_for_text('post')
    )
    similar_line = _copy_tokens(first_line)
    doc = _document_for_tokens([first_line, similar_line])
    MatchingAnnotator(
        [TargetAnnotation('this is matching', TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == (
        [TAG1] * len(matching_tokens)
    )
    # the copied similar line must remain entirely untagged
    assert _get_tags_of_tokens(similar_line) == [None] * len(similar_line)
def test_should_annotate_mult_value_target_annot_rev_order_over_mult_lines_with_b_prefix(
        self):
    """A multi-value annotation given in reverse order still matches in document order."""
    lines = [
        _tokens_for_text('this may'),
        _tokens_for_text('match')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation(list(reversed(['this', 'may', 'match'])), TAG1)
    ], use_tag_begin_prefix=True).annotate(doc)
    expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
    assert _get_tags_of_tokens(tokens) == expected
def test_should_annotate_exactly_matching_across_multiple_lines_with_begin_prefix(
        self):
    """The begin prefix appears only on the very first matched token."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation('this is matching and continues here', TAG1)
    ], use_tag_begin_prefix=True).annotate(doc)
    expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
    assert _get_tags_of_tokens(tokens) == expected
def test_should_not_annotate_short_reference_item_not_followed_by_other_reference_items(
        self):
    """A bonded short reference item without its companion items must not match."""
    matching_reference_item_text = 'ref_id'
    reference_item_texts = [matching_reference_item_text] + ['ref_title']
    ref_tokens = _tokens_for_text(matching_reference_item_text)
    other_tokens = _tokens_for_text('other')
    doc = _document_for_tokens([ref_tokens + other_tokens])
    MatchingAnnotator([
        TargetAnnotation(reference_item_texts, 'reference', bonding=True)
    ]).annotate(doc)
    # bonding requires the other reference items nearby; alone it stays untagged
    assert _get_tags_of_tokens(ref_tokens) == [None] * len(ref_tokens)
def test_should_not_annotate_short_section_title_if_paragraph_follows_later(
        self):
    """require_next: a short title is skipped when its paragraph is not annotated next."""
    section_title_text = 'section title'
    title_tokens = _tokens_for_text(section_title_text + '.')
    other_tokens = _tokens_for_text('other text to come here.')
    doc = _document_for_tokens([title_tokens + other_tokens])
    MatchingAnnotator([
        TargetAnnotation(section_title_text, 'section_title', require_next=True)
    ]).annotate(doc)
    assert _get_tags_of_tokens(title_tokens) == [None] * len(title_tokens)
def test_should_annotate_fuzzily_matching_longer_matches_based_on_ratio(
        self):
    """A long fuzzy match above the ratio threshold tags only the matching part."""
    long_matching_text = 'this is matching and is really really long match that we can trust'
    matching_tokens = _tokens_for_text(long_matching_text)
    no_match_tokens = _tokens_for_text('what comes next is different')
    doc = _document_for_tokens([matching_tokens + no_match_tokens])
    MatchingAnnotator([
        TargetAnnotation(
            long_matching_text + ' but this is not and is another matter',
            TAG1
        )
    ]).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == (
        [TAG1] * len(matching_tokens)
    )
    assert _get_tags_of_tokens(no_match_tokens) == (
        [None] * len(no_match_tokens)
    )
def test_should_annotate_not_match_distant_value_of_multiple_value_target_annotation(
        self):
    """A value far away from the rest of a multi-value annotation stays untagged."""
    matching_tokens = _tokens_for_text('this may match')
    distant_tokens = _tokens_for_text('not')
    distance_in_lines = 10
    filler_lines = [
        _tokens_for_text('other') for _ in range(distance_in_lines)
    ]
    doc = _document_for_tokens(
        [matching_tokens] + filler_lines + [distant_tokens]
    )
    MatchingAnnotator([
        TargetAnnotation(['this', 'may', 'match', 'not'], TAG1)
    ]).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == (
        [TAG1] * len(matching_tokens)
    )
    assert _get_tags_of_tokens(distant_tokens) == (
        [None] * len(distant_tokens)
    )
def test_should_annotate_last_line_of_block_followed_by_other_text(self):
    """Every line of a multi-line block is tagged, including the last one."""
    block_text_lines = [
        'this is the first row',
        'second row follows',
        'here we are on the third',
        'last line of block'
    ]
    block_tokens_per_line = _tokens_for_text_lines(block_text_lines)
    block_tokens = flatten(block_tokens_per_line)
    doc = _document_for_tokens(
        block_tokens_per_line + [_tokens_for_text('other text')]
    )
    MatchingAnnotator([
        TargetAnnotation('\n'.join(block_text_lines), TAG1)
    ]).annotate(doc)
    assert _get_tags_of_tokens(block_tokens) == [TAG1] * len(block_tokens)
def test_should_annotate_same_sequence_multiple_times_with_begin_prefix(
        self):
    """match_multiple with begin prefix: each occurrence restarts with a B- tag."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('this is matching')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation('this is matching', TAG1, match_multiple=True)
    ], use_tag_begin_prefix=True).annotate(doc)
    # the begin tag should appear at the beginning of each match
    assert _get_tags_of_tokens(tokens) == [
        B_TAG_1, I_TAG_1, I_TAG_1,
        B_TAG_1, I_TAG_1, I_TAG_1
    ]
def test_should_not_annotate_similar_sequence_multiple_times(self):
    """Without match_multiple, a later similar line must stay untagged."""
    matching_lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    other_tokens = _tokens_for_text('this is matching')
    matched_tokens = flatten(matching_lines)
    doc = _document_for_tokens(matching_lines + [other_tokens])
    MatchingAnnotator([
        TargetAnnotation('this is matching and continues here', TAG1)
    ]).annotate(doc)
    assert _get_tags_of_tokens(matched_tokens) == (
        [TAG1] * len(matched_tokens)
    )
    assert _get_tags_of_tokens(other_tokens) == [None] * len(other_tokens)
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert LXML page content to annotated and visualised SVG pages.

    Parses *lxml_content* and *xml_content*, extracts target annotations from
    the XML using *xml_mapping*, runs the default annotators plus a
    MatchingAnnotator over the SVG structured document, adds the annotation
    visualisation and logs timing information.

    :param lxml_content: raw LXML content of the document pages
    :param xml_content: raw XML content containing the target annotation values
    :param xml_mapping: mapping describing which XML elements map to which tags
    :param name: optional document name, used only for logging
    :return: list of annotated SVG root elements (one per page)
    """
    stop_watch_recorder = StopWatchRecorder()
    stop_watch_recorder.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)
    # use a more lenient way to parse xml as xml errors are not uncommon
    stop_watch_recorder.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)
    stop_watch_recorder.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(
        xml_root,
        xml_mapping
    )
    stop_watch_recorder.stop()
    annotators = DEFAULT_ANNOTATORS + [MatchingAnnotator(
        target_annotations, use_tag_begin_prefix=True
    )]
    annotator = Annotator(annotators)
    stop_watch_recorder.start('convert to svg')
    svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
    stop_watch_recorder.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(svg_roots))
    stop_watch_recorder.start('add visualisation')
    svg_roots = [
        visualize_svg_annotations(svg_root)
        for svg_root in svg_roots
    ]
    stop_watch_recorder.stop()
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, format(len(lxml_content), ','), format(len(xml_content), ','),
        stop_watch_recorder, align_native_enabled
    )
    return svg_roots
def test_should_annotate_same_sub_annotations_multiple_times_with_begin_prefix(
        self):
    """Sub-annotations on a match_multiple target get B-/I- prefixes per occurrence.

    Note: test renamed from `..._with_begin_prefic` (typo) to
    `..._with_begin_prefix` for consistency with the sibling tests.
    """
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('this is matching')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation(
            'this is matching', TAG2,
            match_multiple=True,
            sub_annotations=[TargetAnnotation('this is', TAG1)]
        )
    ], use_tag_begin_prefix=True).annotate(doc)
    # each repeated match restarts the sub tag sequence with a begin tag
    assert [doc.get_sub_tag(token) for token in tokens] == [
        B_TAG_1, I_TAG_1, None,
        B_TAG_1, I_TAG_1, None
    ]
def test_should_annotate_shorter_target_annotation_in_longer_line_multiple_times_if_enabled(
        self):
    """match_multiple tags the same short sequence on every line it appears in."""
    pre_tokens = _tokens_for_text('pre')
    matching_tokens = _tokens_for_text('this is matching')
    post_tokens = _tokens_for_text('post')
    same_matching_tokens = _copy_tokens(matching_tokens)
    doc = _document_for_tokens([
        pre_tokens + matching_tokens + post_tokens,
        _copy_tokens(pre_tokens) + same_matching_tokens + _copy_tokens(post_tokens)
    ])
    MatchingAnnotator([
        TargetAnnotation('this is matching', TAG1, match_multiple=True)
    ]).annotate(doc)
    for token_group in (matching_tokens, same_matching_tokens):
        assert _get_tags_of_tokens(token_group) == [TAG1] * len(token_group)
def test_should_annotate_short_section_title_followed_by_paragraph(self):
    """require_next: a short title matches when its paragraph follows directly."""
    section_title_text = 'section title'
    section_paragraph_text = 'paragraph text to come here.'
    title_tokens = _tokens_for_text(section_title_text + '.')
    paragraph_tokens = _tokens_for_text(section_paragraph_text)
    doc = _document_for_tokens([title_tokens + paragraph_tokens])
    MatchingAnnotator([
        TargetAnnotation(section_title_text, 'section_title', require_next=True),
        TargetAnnotation(section_paragraph_text, 'section_paragraph')
    ]).annotate(doc)
    assert _get_tags_of_tokens(title_tokens) == (
        ['section_title'] * len(title_tokens)
    )
    assert _get_tags_of_tokens(paragraph_tokens) == (
        ['section_paragraph'] * len(paragraph_tokens)
    )
def test_should_annotate_sub_tag_across_multiple_tokens(self):
    """A sub-annotation spanning several tokens tags each of those tokens."""
    sub_matching_tokens = _tokens_for_text('this is matching')
    tag_matching_tokens = (
        _tokens_for_text('something before')
        + sub_matching_tokens
        + _tokens_for_text('more text to come')
    )
    all_tokens = (
        _tokens_for_text('not matching')
        + tag_matching_tokens
        + _tokens_for_text('and there')
    )
    doc = _document_for_tokens([all_tokens])
    MatchingAnnotator([
        TargetAnnotation(
            _tokens_to_text(tag_matching_tokens), TAG2,
            sub_annotations=[
                TargetAnnotation(_tokens_to_text(sub_matching_tokens), TAG1)
            ]
        )
    ], use_tag_begin_prefix=False).annotate(doc)
    sub_tags = [doc.get_sub_tag(token) for token in sub_matching_tokens]
    assert sub_tags == [TAG1, TAG1, TAG1]
def test_should_annotate_multiple_shorter_target_annotation_in_longer_line(
        self):
    """Two annotations within one line each tag only their own tokens."""
    pre_tokens = _tokens_for_text('pre')
    tag1_tokens = _tokens_for_text('this is matching')
    mid_tokens = _tokens_for_text('mid')
    tag2_tokens = _tokens_for_text('also good')
    post_tokens = _tokens_for_text('post')
    doc = _document_for_tokens([
        pre_tokens + tag1_tokens + mid_tokens + tag2_tokens + post_tokens
    ])
    MatchingAnnotator([
        TargetAnnotation('this is matching', TAG1),
        TargetAnnotation('also good', TAG2)
    ]).annotate(doc)
    assert _get_tags_of_tokens(pre_tokens) == [None] * len(pre_tokens)
    assert _get_tags_of_tokens(tag1_tokens) == [TAG1] * len(tag1_tokens)
    assert _get_tags_of_tokens(mid_tokens) == [None] * len(mid_tokens)
    assert _get_tags_of_tokens(tag2_tokens) == [TAG2] * len(tag2_tokens)
    assert _get_tags_of_tokens(post_tokens) == [None] * len(post_tokens)
def test_should_annotate_shorter_sequence_over_multiple_lines_considering_next_line(
        self):
    """A short split sequence matches only when lines are considered together."""
    # use a short sequence that wouldn't get matched on its own
    matching_text_lines = ('this may', 'match')
    matching_tokens_by_line = _tokens_for_text_lines(matching_text_lines)
    matching_tokens = flatten(matching_tokens_by_line)
    # repeat the same text on the two lines; only by combining the lines
    # would it be clear which tokens to match
    pre_tokens = _tokens_for_text(
        matching_text_lines[0] + ' be some other longer preceeding text'
    )
    post_tokens = _tokens_for_text(
        'this is some text after but no ' + matching_text_lines[1]
    )
    doc = _document_for_tokens([
        pre_tokens + matching_tokens_by_line[0],
        matching_tokens_by_line[1] + post_tokens
    ])
    MatchingAnnotator(
        [TargetAnnotation('this may match', TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == (
        [TAG1] * len(matching_tokens)
    )
    assert _get_tags_of_tokens(pre_tokens) == [None] * len(pre_tokens)
    assert _get_tags_of_tokens(post_tokens) == [None] * len(post_tokens)
def test_should_annotate_over_multiple_lines_with_tag_transition(self):
    """Two annotations sharing a line should transition cleanly between tags."""
    tag1_lines = [
        _tokens_for_text('this may'),
        _tokens_for_text('match')
    ]
    tag2_lines = [
        _tokens_for_text('another'),
        _tokens_for_text('tag here')
    ]
    tag1_tokens = flatten(tag1_lines)
    tag2_tokens = flatten(tag2_lines)
    # middle line contains the end of tag1 and the start of tag2
    doc = _document_for_tokens([
        tag1_lines[0],
        tag1_lines[1] + tag2_lines[0],
        tag2_lines[1]
    ])
    MatchingAnnotator([
        TargetAnnotation('this may match', TAG1),
        TargetAnnotation('another tag here', TAG2)
    ]).annotate(doc)
    assert _get_tags_of_tokens(tag1_tokens) == [TAG1] * len(tag1_tokens)
    assert _get_tags_of_tokens(tag2_tokens) == [TAG2] * len(tag2_tokens)
def test_should_not_fail_on_empty_document(self):
    """Annotating an empty document must not raise."""
    empty_doc = SimpleStructuredDocument(lines=[])
    MatchingAnnotator([]).annotate(empty_doc)
def test_should_not_fail_on_empty_line_with_blank_token(self):
    """Annotating a line containing only a blank token must not raise."""
    doc = _document_for_tokens([[SimpleToken('')]])
    MatchingAnnotator([
        TargetAnnotation('this is. matching', TAG1)
    ]).annotate(doc)