def test_should_tag_single_token_within_partial_prediction_at_smaller_scale(self):
    """Tag a token when the predicted image is smaller than the page.

    The page box is 100x the default size, while the token box and the
    predicted image are both 10x. Only a default-sized region at the origin
    of the image is painted with COLOR_1; the token is still expected to be
    tagged with TAG_1 in the CV scope.
    """
    token = SimpleToken(TOKEN_TEXT_1)
    document = SimpleStructuredDocument(lines=[SimpleLine([token])])

    first_page = document.get_pages()[0]
    page_box = BoundingBox(0, 0, DEFAULT_WIDTH * 100, DEFAULT_HEIGHT * 100)
    document.set_bounding_box(first_page, page_box)

    token_box = BoundingBox(0, 0, DEFAULT_WIDTH * 10, DEFAULT_HEIGHT * 10)
    document.set_bounding_box(token, token_box)

    predicted_image = filled_image(
        BG_COLOR,
        {TAG_1: COLOR_1},
        width=DEFAULT_WIDTH * 10,
        height=DEFAULT_HEIGHT * 10
    )
    painted_region = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT)
    fill_rect(predicted_image, painted_region, COLOR_1)

    annotate_structured_document_using_predicted_images(
        document, [predicted_image]
    )

    actual_tag = document.get_tag(token, scope=CV_TAG_SCOPE)
    assert actual_tag == TAG_1
def test_should_calculate_bounding_box_of_page_without_xy(self):
    """A PAGE with only width/height is expected to yield a box at (0, 0)."""
    page_node = E.PAGE({'width': '100', 'height': '101'})
    document = LxmlStructuredDocument(E.DOCUMENT(page_node))

    actual_box = document.get_bounding_box(page_node)

    assert actual_box == BoundingBox(0, 0, 100, 101)
def test_should_be_able_to_set_bounding_box(self):
    """Setting a box stores its formatted form in the bounding box attribute."""
    svg_text = SVG_TEXT('a', {'x': '10', 'y': '11', 'font-size': '100'})
    document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(svg_text)))
    new_box = BoundingBox(11, 12, 101, 102)

    document.set_bounding_box(svg_text, new_box)

    stored_value = svg_text.attrib[SVGE_BOUNDING_BOX]
    assert stored_value == format_bounding_box(new_box)
def get_node_bounding_box(t):
    """Build a BoundingBox from a node's positional attributes.

    ``x`` and ``y`` fall back to 0 when the attribute is absent, whereas
    ``width`` and ``height`` are looked up directly and therefore must be
    present on the node.
    """
    node_attrib = t.attrib
    left = float(node_attrib.get('x', 0))
    top = float(node_attrib.get('y', 0))
    box_width = float(node_attrib['width'])
    box_height = float(node_attrib['height'])
    return BoundingBox(left, top, box_width, box_height)
def test_should_use_viewbox_if_available(self):
    """The viewBox attribute of the svg root is returned as its bounding box."""
    expected_box = BoundingBox(11, 12, 101, 102)
    viewbox_value = format_bounding_box(expected_box)
    svg_root = E.svg({SVG_VIEWBOX_ATTRIB: viewbox_value})
    document = SvgStructuredDocument(svg_root)

    actual_box = document.get_bounding_box(svg_root)

    assert actual_box == expected_box
def test_should_create_text_node_with_common_attributes(self):
    """A single LXML token becomes one SVG text node carrying its attributes.

    Checks the text, position, font attributes, fill color and the bounding
    box attribute of the generated SVG text element.
    """
    lxml_token = E.TOKEN(SOME_TEXT, COMMON_LXML_TOKEN_ATTRIBS)
    lxml_root = E.DOCUMENT(E.PAGE(E.TEXT(lxml_token)))

    svg_pages = list(iter_svg_pages_for_lxml(lxml_root))
    assert len(svg_pages) == 1

    svg_text = svg_pages[0].find('.//' + SVG_TEXT)
    assert svg_text is not None
    assert svg_text.text == SOME_TEXT

    svg_attrib = svg_text.attrib
    assert float(svg_attrib[SVG.X]) == float(SOME_X)
    assert float(svg_attrib[SVG.Y]) == float(SOME_Y)
    assert float(svg_attrib[SVG.FONT_SIZE]) == float(SOME_FONT_SIZE)
    assert svg_attrib[SVG.FONT_FAMILY] == SOME_FONT_FAMILY
    assert svg_attrib[SVG.FILL] == SOME_FONT_COLOR

    expected_box = BoundingBox(
        float(COMMON_LXML_TOKEN_ATTRIBS[LXML.X]),
        float(COMMON_LXML_TOKEN_ATTRIBS[LXML.Y]),
        float(COMMON_LXML_TOKEN_ATTRIBS[LXML.WIDTH]),
        float(COMMON_LXML_TOKEN_ATTRIBS[LXML.HEIGHT])
    )
    actual_box = parse_bounding_box(svg_attrib.get(SVG.BOUNDING_BOX))
    assert actual_box == expected_box
def test_should_accept_float_image_size(self):
    """A 3.1 x 3.9 float image size is expected to produce a 4 x 4 image."""
    single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
    tag_colors = {TAG1: (0, 255, 0)}

    image = annotated_blocks_to_image(
        [single_block], color_map=tag_colors, width=3.1, height=3.9
    )

    assert image.size == (4, 4)
def test_should_convert_rect_color_name(self):
    """The color name 'green' is expected to be drawn as opaque (0, 128, 0)."""
    single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
    tag_colors = {TAG1: 'green'}

    image = annotated_blocks_to_image(
        [single_block], color_map=tag_colors, width=3, height=3
    )

    top_left_pixel = image.getpixel((0, 0))
    assert top_left_pixel == (0, 128, 0, 255)
def test_should_ignore_unmapped_tag(self):
    """With an empty color map the pixel is expected to stay (255, 255, 255, 0)."""
    single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))

    image = annotated_blocks_to_image(
        [single_block], color_map={}, width=3, height=3
    )

    top_left_pixel = image.getpixel((0, 0))
    assert top_left_pixel == (255, 255, 255, 0)
def test_should_calculate_default_bounding_box(self):
    """Without an explicit box, a one-character text is 0.8 * font-size wide."""
    svg_text = SVG_TEXT('a', {'x': '10', 'y': '11', 'font-size': '100'})
    document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(svg_text)))

    actual_box = document.get_bounding_box(svg_text)

    assert actual_box == BoundingBox(10, 11, 100 * 0.8, 100)
def test_should_create_rect_for_single_annotated_block(self):
    """A mapped block is expected to be drawn in its opaque mapped color."""
    single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
    tag_colors = {TAG1: (0, 255, 0)}

    image = annotated_blocks_to_image(
        [single_block], color_map=tag_colors, width=3, height=3
    )

    top_left_pixel = image.getpixel((0, 0))
    assert top_left_pixel == (0, 255, 0, 255)
def test_should_calculate_default_bounding_box(self):
    """A TOKEN's x/y/width/height attributes are returned as its bounding box."""
    token_node = E.TOKEN({
        'x': '10',
        'y': '11',
        'width': '100',
        'height': '101'
    })
    document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.TEXT(token_node)))
    )

    actual_box = document.get_bounding_box(token_node)

    assert actual_box == BoundingBox(10, 11, 100, 101)
def test_should_use_bounding_box_if_available(self):
    """An explicit bounding box attribute wins over x/y/font-size."""
    expected_box = BoundingBox(11, 12, 101, 102)
    text_attrib = {
        'x': '10',
        'y': '11',
        'font-size': '100',
        SVGE_BOUNDING_BOX: format_bounding_box(expected_box)
    }
    svg_text = SVG_TEXT('a', text_attrib)
    document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(svg_text)))

    actual_box = document.get_bounding_box(svg_text)

    assert actual_box == expected_box
def test_should_be_able_to_set_bounding_box(self):
    """A bounding box set on a TOKEN is returned by a subsequent get."""
    token_node = E.TOKEN({
        'x': '20',
        'y': '21',
        'width': '200',
        'height': '201'
    })
    document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.TEXT(token_node)))
    )
    new_box = BoundingBox(10, 11, 100, 101)

    document.set_bounding_box(token_node, new_box)

    assert document.get_bounding_box(token_node) == new_box
def test_should_estimate_width_based_on_number_of_characters(self):
    """The estimated width is 0.8 * font-size per character of the text."""
    content = 'abc'
    svg_text = SVG_TEXT(content, {'x': '10', 'y': '11', 'font-size': '100'})
    document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(svg_text)))

    actual_box = document.get_bounding_box(svg_text)

    expected_box = BoundingBox(10, 11, 100 * 0.8 * len(content), 100)
    assert actual_box == expected_box
def get_node_bounding_box(t):
    """Return the bounding box of an SVG node, or None if it cannot be derived.

    Lookup order:

    1. the explicit bounding box attribute (``SVGE_BOUNDING_BOX``),
    2. the ``viewBox`` attribute (``SVG_VIEWBOX_ATTRIB``),
    3. an estimate from ``x``, ``y`` and ``font-size``: the height is the
       font size and the width is ``0.8 * font-size`` per character, with a
       minimum of one character.

    Returns None when none of the above is available.
    """
    attrib = t.attrib
    if SVGE_BOUNDING_BOX in attrib:
        return parse_bounding_box(attrib[SVGE_BOUNDING_BOX])
    if SVG_VIEWBOX_ATTRIB in attrib:
        return parse_bounding_box(attrib[SVG_VIEWBOX_ATTRIB])
    if not ('font-size' in attrib and 'x' in attrib and 'y' in attrib):
        return None
    font_size = float(attrib['font-size'])
    # An element without text content has ``text`` set to None; treat it as
    # an empty string so the one-character minimum applies instead of
    # len(None) raising a TypeError.
    char_count = max(1, len(t.text or ''))
    width = font_size * 0.8 * char_count
    return BoundingBox(float(attrib['x']), float(attrib['y']), width, font_size)
def test_should_merge_right_nearby_block_with_same_tag_using_fractions(self):
    """Two same-tag blocks with fractional coordinates are merged into one.

    The second block is the first one moved right by its own width plus the
    nearby tolerance; the merged box must cover both.
    """
    block1 = AnnotationBlock(TAG1, BoundingBox(10.5, 10.5, 10.9, 26.3))
    horizontal_offset = block1.bounding_box.width + DEFAULT_NEARBY_TOLERANCE
    block2 = AnnotationBlock(
        TAG1, block1.bounding_box.move_by(horizontal_offset, 0)
    )

    merged_blocks = merge_blocks(
        [block1, block2], nearby_tolerance=DEFAULT_NEARBY_TOLERANCE
    )

    merged_tags = [merged.tag for merged in merged_blocks]
    assert merged_tags == [TAG1]
    combined_box = block1.bounding_box.include(block2.bounding_box)
    assert merged_blocks[0].bounding_box == combined_box
from sciencebeam_gym.structured_document.svg import (SVG_NS) from sciencebeam_gym.preprocess.blockify_annotations import ( annotation_document_page_to_annotation_blocks, annotation_document_page_to_merged_blocks, annotated_blocks_to_svg, annotated_blocks_to_image, merge_blocks, AnnotationBlock) TAG1 = 'tag1' TAG2 = 'tag2' DEFAULT_SVGX_WIDTH = 10 DEFAULT_FONT_SIZE = 10 DEFAULT_COLOR = 'red' DEFAULT_BOUNDING_BOX = BoundingBox(0, 0, 16, 10) DEFAULT_NEARBY_TOLERANCE = 10 def setup_module(): logging.basicConfig(level=logging.DEBUG) class TestAnnotatedBlocksToSvg(object): def test_should_create_rect_for_single_annotated_block(self): blocks = [AnnotationBlock(TAG1, DEFAULT_BOUNDING_BOX)] result_svg = annotated_blocks_to_svg(blocks, color_map={TAG1: DEFAULT_COLOR}, width=100,
def test_should_not_equal_none(self):
    """Calling __eq__ with None must give a falsy result."""
    box = BoundingBox(11, 12, 101, 102)
    # __eq__ is invoked directly so the method itself is exercised.
    equals_none = box.__eq__(None)
    assert not equals_none
def test_should_include_another_bounding_box_to_the_top_left(self):
    """Including a box further up/left extends the origin and keeps the far edge."""
    base_box = BoundingBox(100, 100, 200, 200)
    top_left_box = BoundingBox(10, 20, 50, 100)

    combined_box = base_box.include(top_left_box)

    # Right/bottom edge stays at 100 + 200; the origin moves to (10, 20).
    expected_box = BoundingBox(10, 20, 100 + 200 - 10, 100 + 200 - 20)
    assert combined_box == expected_box
def test_should_indicate_empty_with_zero_height(self):
    """A box with zero height reports itself as empty."""
    zero_height_box = BoundingBox(0, 0, 100, 0)
    assert zero_height_box.empty()
def parse_bounding_box(bounding_box_str):
    """Parse an ``x y width height`` string into a BoundingBox.

    The four numbers may be separated by whitespace and/or commas, which is
    what the SVG ``viewBox`` attribute permits.

    Args:
        bounding_box_str: the textual bounding box, or a falsy value.

    Returns:
        A BoundingBox with float coordinates, or None if the input is
        empty or None.

    Raises:
        ValueError: if the string does not contain exactly four numbers.
    """
    if not bounding_box_str:
        return None
    # Normalise comma separators so both "0 0 10 20" and "0,0,10,20" parse.
    x, y, width, height = bounding_box_str.replace(',', ' ').split()
    return BoundingBox(float(x), float(y), float(width), float(height))
SimplePage, SimpleLine, SimpleToken ) from sciencebeam_gym.models.text.feature_extractor import ( structured_document_to_token_props, token_props_list_to_features, token_props_list_to_labels, remove_labels_from_token_props_list, merge_with_cv_structured_document, NONE_TAG, CV_TAG_SCOPE ) PAGE_BOUNDING_BOX = BoundingBox(0, 0, 100, 200) TOKEN_BOUNDING_BOX = BoundingBox(10, 10, 10, 20) TEXT_1 = 'Text 1' TEXT_2 = 'Text 2' TEXT_3 = 'Text 3' TAG_1 = 'tag1' TAG_2 = 'tag2' TAG_3 = 'tag3' SCOPE_1 = 'scope1' class TestStructuredDocumentToTokenProps(object): def test_should_return_empty_token_list_if_document_has_no_pages(self):
def scale_bounding_box(bounding_box, rx, ry):
    """Return a new BoundingBox scaled by ``rx`` horizontally and ``ry`` vertically.

    Both the origin and the size are multiplied by the respective factor.
    """
    scaled_x = bounding_box.x * rx
    scaled_y = bounding_box.y * ry
    scaled_width = bounding_box.width * rx
    scaled_height = bounding_box.height * ry
    return BoundingBox(scaled_x, scaled_y, scaled_width, scaled_height)
annotate_structured_document_using_predicted_images, parse_args, CV_TAG_SCOPE ) TAG_1 = 'tag1' TOKEN_TEXT_1 = 'a token value' COLOR_1 = (1, 1, 1) BG_COLOR = (255, 255, 255) DEFAULT_HEIGHT = 3 DEFAULT_WIDTH = 3 DEFAULT_BOUNDING_BOX = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT) def filled_image(color, color_map, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT): return AnnotatedImage( np.full((height, width, 3), color), color_map ) def fill_rect(annoted_image, bounding_box, color): for y in range(bounding_box.y, bounding_box.y + bounding_box.height): for x in range(bounding_box.x, bounding_box.x + bounding_box.width): annoted_image.data[y, x] = color
def test_should_return_zero_tag_probality_if_bounding_box_is_empty(self):
    """An empty bounding box yields a probability of 0.0 for the tag."""
    annotated_image = filled_image(BG_COLOR, {TAG_1: COLOR_1})
    empty_box = BoundingBox(0, 0, 0, 0)

    probabilities = annotated_image.get_tag_probabilities_within(empty_box)

    assert probabilities.get(TAG_1) == 0.0
def _svg_text_attrib_for_lxml_token(token):
    """Build the attribute dict of the SVG text element for one LXML TOKEN."""
    token_attrib = token.attrib
    x = float(token_attrib.get('x'))
    y = float(token_attrib.get('y'))
    height = float(token_attrib.get('height'))
    width = float(token_attrib.get('width'))
    # The text is positioned at the baseline if given, otherwise at y.
    baseline = float(token_attrib.get('base', y))
    svg_attrib = {
        'x': str(x),
        'y': str(baseline),
        'font-size': token_attrib.get('font-size'),
        'font-family': token_attrib.get('font-name'),
        'fill': token_attrib.get('font-color'),
        SVGE_BOUNDING_BOX: svg_format_bounding_box(
            BoundingBox(x, y, width, height)
        )
    }
    angle = float(token_attrib.get('angle', '0'))
    if token_attrib.get('rotation') == '1' and angle == 90.0:
        # Rotated tokens are placed through a transform (translate to the
        # vertical centre, then rotate) instead of through x/y.
        vertical_center = y + height / 2.0
        svg_attrib['x'] = '0'
        svg_attrib['y'] = '0'
        svg_attrib['transform'] = 'translate({x} {y}) rotate({angle})'.format(
            x=str(x), y=str(vertical_center), angle=str(-angle)
        )
    return svg_attrib


def iter_svg_pages_for_lxml(lxml_root, add_background=True):
    """Yield one SVG root element per ``//DOCUMENT/PAGE`` of the LXML tree.

    Each TEXT element becomes an SVG group of class ``SvgStyleClasses.LINE``
    holding one SVG text element per TOKEN. Consecutive TEXT elements that
    share the same BLOCK parent are collected in a common group of class
    ``SvgStyleClasses.BLOCK``; other lines are appended to the SVG root.

    Args:
        lxml_root: root of the LXML document.
        add_background: whether to add a white, full-size background rect
            as the first child of every page.
    """
    current_block = None
    current_svg_block = None
    for page in lxml_root.xpath('//DOCUMENT/PAGE'):
        svg_root = etree.Element(SVG_DOC, nsmap=SVG_NSMAP)
        page_width = page.attrib.get('width')
        page_height = page.attrib.get('height')
        if page_width and page_height:
            svg_root.attrib['viewBox'] = '0 0 %s %s' % (page_width, page_height)
        if add_background:
            background = etree.Element(SVG_RECT, attrib={
                'width': '100%',
                'height': '100%',
                'fill': 'white',
                'class': 'background'
            })
            svg_root.append(background)
        for text in page.xpath('.//TEXT'):
            svg_line = etree.Element(
                SVG_G, nsmap=SVG_NSMAP, attrib={'class': SvgStyleClasses.LINE}
            )
            for token in text.xpath('./TOKEN'):
                svg_line.append(ElementWithText(
                    SVG_TEXT,
                    token.text,
                    attrib=_svg_text_attrib_for_lxml_token(token)
                ))
            text_parent = text.getparent()
            if text_parent.tag != 'BLOCK':
                # A line outside of any block ends the current block group.
                current_block = None
                current_svg_block = None
                svg_root.append(svg_line)
                continue
            if text_parent != current_block:
                current_svg_block = etree.Element(
                    SVG_G, nsmap=SVG_NSMAP,
                    attrib={'class': SvgStyleClasses.BLOCK}
                )
                svg_root.append(current_svg_block)
                current_block = text_parent
            current_svg_block.append(svg_line)
        yield svg_root
def test_should_not_equal_bounding_boxes_with_different_height(self):
    """Boxes that differ only in height must compare as unequal."""
    reference_box = BoundingBox(11, 12, 101, 102)
    taller_box = BoundingBox(11, 12, 101, 999)
    assert reference_box != taller_box
from sciencebeam_gym.models.text.feature_extractor import (
    structured_document_to_token_props,
    token_props_list_to_features,
    NONE_TAG
)

from sciencebeam_gym.utils.bounding_box import (
    BoundingBox
)

from sciencebeam_gym.models.text.crf.annotate_using_predictions import (
    annotate_structured_document_using_predictions,
    predict_and_annotate_structured_document,
    CRF_TAG_SCOPE
)

TAG_1 = 'tag1'

TOKEN_TEXT_1 = 'token 1'
TOKEN_TEXT_2 = 'token 2'

BOUNDING_BOX = BoundingBox(0, 0, 10, 10)


class TestAnnotateStructuredDocumentUsingPredictions(object):
    """Tests for annotate_structured_document_using_predictions."""

    def test_should_not_fail_with_empty_document(self):
        """An empty document with no predictions must not raise."""
        empty_document = SimpleStructuredDocument()
        no_predictions = []
        annotate_structured_document_using_predictions(
            empty_document, no_predictions
        )

    def test_should_tag_single_token_using_prediction(self):
        """A single prediction is applied to the single token in CRF scope."""
        token = SimpleToken(TOKEN_TEXT_1)
        document = SimpleStructuredDocument(lines=[SimpleLine([token])])

        annotate_structured_document_using_predictions(document, [TAG_1])

        actual_tag = document.get_tag(token, scope=CRF_TAG_SCOPE)
        assert actual_tag == TAG_1
def test_should_equal_same_bounding_boxes(self):
    """Two boxes built from identical values must compare as equal."""
    first_box = BoundingBox(11, 12, 101, 102)
    second_box = BoundingBox(11, 12, 101, 102)
    assert first_box == second_box