Exemple #1
0
 def test_should_tag_single_token_within_partial_prediction_at_smaller_scale(self):
     """Tag a token from a predicted image that is smaller than the page.

     The page box is 100x the default size while the predicted image is only
     10x; only the top-left default-sized area of the image is painted in the
     tag colour.
     """
     token = SimpleToken(TOKEN_TEXT_1)
     document = SimpleStructuredDocument(lines=[SimpleLine([token])])

     page_box = BoundingBox(0, 0, DEFAULT_WIDTH * 100, DEFAULT_HEIGHT * 100)
     document.set_bounding_box(document.get_pages()[0], page_box)

     token_box = BoundingBox(0, 0, DEFAULT_WIDTH * 10, DEFAULT_HEIGHT * 10)
     document.set_bounding_box(token, token_box)

     predicted_image = filled_image(
         BG_COLOR,
         {TAG_1: COLOR_1},
         width=DEFAULT_WIDTH * 10,
         height=DEFAULT_HEIGHT * 10
     )
     painted_area = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT)
     fill_rect(predicted_image, painted_area, COLOR_1)

     annotate_structured_document_using_predicted_images(
         document, [predicted_image]
     )

     assigned_tag = document.get_tag(token, scope=CV_TAG_SCOPE)
     assert assigned_tag == TAG_1
Exemple #2
0
 def test_should_calculate_bounding_box_of_page_without_xy(self):
     """A PAGE with only width/height yields a box positioned at (0, 0)."""
     page_attributes = {'width': '100', 'height': '101'}
     page = E.PAGE(page_attributes)
     document = LxmlStructuredDocument(E.DOCUMENT(page))
     expected_box = BoundingBox(0, 0, 100, 101)
     assert document.get_bounding_box(page) == expected_box
 def test_should_be_able_to_set_bounding_box(self):
     """Setting a box stores its formatted form in the node's attribute."""
     text_node = SVG_TEXT('a', {'x': '10', 'y': '11', 'font-size': '100'})
     document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(text_node)))
     new_box = BoundingBox(11, 12, 101, 102)
     document.set_bounding_box(text_node, new_box)
     stored_value = text_node.attrib[SVGE_BOUNDING_BOX]
     assert stored_value == format_bounding_box(new_box)
Exemple #4
0
def get_node_bounding_box(t):
    """Build a ``BoundingBox`` from the attributes of node *t*.

    'x' and 'y' default to 0 when absent; 'width' and 'height' are required
    (a missing one raises ``KeyError``). All values are converted to float.
    """
    node_attrib = t.attrib
    left = float(node_attrib.get('x', 0))
    top = float(node_attrib.get('y', 0))
    box_width = float(node_attrib['width'])
    box_height = float(node_attrib['height'])
    return BoundingBox(left, top, box_width, box_height)
Exemple #5
0
 def test_should_use_viewbox_if_available(self):
     """The page's bounding box is read back from its viewBox attribute."""
     expected_box = BoundingBox(11, 12, 101, 102)
     viewbox_attributes = {
         SVG_VIEWBOX_ATTRIB: format_bounding_box(expected_box)
     }
     svg_page = E.svg(viewbox_attributes)
     document = SvgStructuredDocument(svg_page)
     assert document.get_bounding_box(svg_page) == expected_box
Exemple #6
0
 def test_should_create_text_node_with_common_attributes(self):
     """A single LXML token becomes one SVG text node carrying its attributes."""
     token = E.TOKEN(SOME_TEXT, COMMON_LXML_TOKEN_ATTRIBS)
     lxml_root = E.DOCUMENT(E.PAGE(E.TEXT(token)))

     svg_pages = list(iter_svg_pages_for_lxml(lxml_root))
     assert len(svg_pages) == 1

     svg_text = svg_pages[0].find('.//' + SVG_TEXT)
     assert svg_text is not None
     assert svg_text.text == SOME_TEXT

     # Numeric attributes are compared as floats, the others verbatim.
     assert float(svg_text.attrib[SVG.X]) == float(SOME_X)
     assert float(svg_text.attrib[SVG.Y]) == float(SOME_Y)
     assert float(svg_text.attrib[SVG.FONT_SIZE]) == float(SOME_FONT_SIZE)
     assert svg_text.attrib[SVG.FONT_FAMILY] == SOME_FONT_FAMILY
     assert svg_text.attrib[SVG.FILL] == SOME_FONT_COLOR

     actual_box = parse_bounding_box(svg_text.attrib.get(SVG.BOUNDING_BOX))
     expected_box = BoundingBox(
         float(COMMON_LXML_TOKEN_ATTRIBS[LXML.X]),
         float(COMMON_LXML_TOKEN_ATTRIBS[LXML.Y]),
         float(COMMON_LXML_TOKEN_ATTRIBS[LXML.WIDTH]),
         float(COMMON_LXML_TOKEN_ATTRIBS[LXML.HEIGHT])
     )
     assert actual_box == expected_box
    def test_should_accept_float_image_size(self):
        """Fractional width/height are accepted; 3.1 x 3.9 gives a 4 x 4 image."""
        single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
        green_for_tag = {TAG1: (0, 255, 0)}

        image = annotated_blocks_to_image(
            [single_block],
            color_map=green_for_tag,
            width=3.1,
            height=3.9
        )
        assert image.size == (4, 4)
    def test_should_convert_rect_color_name(self):
        """A colour given by name ('green') is rendered as its RGBA value."""
        single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
        named_color_map = {TAG1: 'green'}

        image = annotated_blocks_to_image(
            [single_block],
            color_map=named_color_map,
            width=3,
            height=3
        )
        top_left_pixel = image.getpixel((0, 0))
        assert top_left_pixel == (0, 128, 0, 255)
    def test_should_ignore_unmapped_tag(self):
        """A block whose tag has no colour mapping leaves the pixel untouched."""
        single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
        empty_color_map = {}

        image = annotated_blocks_to_image(
            [single_block],
            color_map=empty_color_map,
            width=3,
            height=3
        )
        top_left_pixel = image.getpixel((0, 0))
        assert top_left_pixel == (255, 255, 255, 0)
Exemple #10
0
 def test_should_calculate_default_bounding_box(self):
     """Without a stored box, a one-character text node gets an estimated one.

     Expected: x/y from the attributes, width 0.8 * font-size, height font-size.
     """
     text_attributes = {'x': '10', 'y': '11', 'font-size': '100'}
     text_node = SVG_TEXT('a', text_attributes)
     document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(text_node)))
     expected_box = BoundingBox(10, 11, 100 * 0.8, 100)
     assert document.get_bounding_box(text_node) == expected_box
    def test_should_create_rect_for_single_annotated_block(self):
        """A mapped block paints its area with the mapped colour, fully opaque."""
        single_block = AnnotationBlock(TAG1, BoundingBox(0, 0, 1, 1))
        green_for_tag = {TAG1: (0, 255, 0)}

        image = annotated_blocks_to_image(
            [single_block],
            color_map=green_for_tag,
            width=3,
            height=3
        )
        top_left_pixel = image.getpixel((0, 0))
        assert top_left_pixel == (0, 255, 0, 255)
Exemple #12
0
 def test_should_calculate_default_bounding_box(self):
     """A TOKEN's bounding box is taken from its x/y/width/height attributes."""
     token_attributes = {
         'x': '10', 'y': '11', 'width': '100', 'height': '101'
     }
     token = E.TOKEN(token_attributes)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(E.TEXT(token))))
     expected_box = BoundingBox(10, 11, 100, 101)
     assert document.get_bounding_box(token) == expected_box
Exemple #13
0
 def test_should_use_bounding_box_if_available(self):
     """A stored bounding-box attribute wins over the x/y/font-size estimate."""
     stored_box = BoundingBox(11, 12, 101, 102)
     text_attributes = {
         'x': '10',
         'y': '11',
         'font-size': '100',
         SVGE_BOUNDING_BOX: format_bounding_box(stored_box)
     }
     text_node = SVG_TEXT('a', text_attributes)
     document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(text_node)))
     assert document.get_bounding_box(text_node) == stored_box
Exemple #14
0
 def test_should_be_able_to_set_bounding_box(self):
     """A box set on a TOKEN replaces the one derived from its attributes."""
     original_attributes = {
         'x': '20', 'y': '21', 'width': '200', 'height': '201'
     }
     token = E.TOKEN(original_attributes)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(E.TEXT(token))))
     replacement_box = BoundingBox(10, 11, 100, 101)
     document.set_bounding_box(token, replacement_box)
     assert document.get_bounding_box(token) == replacement_box
Exemple #15
0
 def test_should_estimate_width_based_on_number_of_characters(self):
     """The estimated width scales with the number of characters in the text."""
     content = 'abc'
     text_attributes = {'x': '10', 'y': '11', 'font-size': '100'}
     text_node = SVG_TEXT(content, text_attributes)
     document = SvgStructuredDocument(E.svg(SVG_TEXT_LINE(text_node)))
     expected_box = BoundingBox(10, 11, 100 * 0.8 * len(content), 100)
     assert document.get_bounding_box(text_node) == expected_box
Exemple #16
0
def get_node_bounding_box(t):
    """Return the bounding box of SVG node *t*, or ``None`` if unknown.

    Sources are tried in this order:

    1. the explicit bounding-box attribute (``SVGE_BOUNDING_BOX``);
    2. the view box attribute (``SVG_VIEWBOX_ATTRIB``);
    3. an estimate from the 'x', 'y' and 'font-size' attributes, where the
       height is the font size and the width is ``font_size * 0.8`` per
       character (at least one character is assumed).

    ``None`` is returned when none of the above is available.
    """
    attrib = t.attrib
    if SVGE_BOUNDING_BOX in attrib:
        return parse_bounding_box(attrib[SVGE_BOUNDING_BOX])
    if SVG_VIEWBOX_ATTRIB in attrib:
        return parse_bounding_box(attrib[SVG_VIEWBOX_ATTRIB])
    if not ('font-size' in attrib and 'x' in attrib and 'y' in attrib):
        return None
    font_size = float(attrib['font-size'])
    # An element without text content has ``text is None``; treat it as an
    # empty string so the estimate falls back to a single character width
    # instead of raising TypeError from ``len(None)``.
    text = t.text or ''
    width = font_size * 0.8 * max(1, len(text))
    return BoundingBox(float(attrib['x']), float(attrib['y']), width,
                       font_size)
    def test_should_merge_right_nearby_block_with_same_tag_using_fractions(
            self):
        """Two same-tag blocks within tolerance merge, even with fractional boxes."""
        left_block = AnnotationBlock(TAG1, BoundingBox(10.5, 10.5, 10.9, 26.3))
        left_box = left_block.bounding_box
        # Place the second block exactly the tolerance away from the right edge.
        horizontal_shift = left_box.width + DEFAULT_NEARBY_TOLERANCE
        right_block = AnnotationBlock(TAG1, left_box.move_by(horizontal_shift, 0))

        merged_blocks = merge_blocks(
            [left_block, right_block],
            nearby_tolerance=DEFAULT_NEARBY_TOLERANCE
        )

        merged_tags = [merged.tag for merged in merged_blocks]
        assert merged_tags == [TAG1]
        expected_box = left_block.bounding_box.include(right_block.bounding_box)
        assert merged_blocks[0].bounding_box == expected_box
from sciencebeam_gym.structured_document.svg import (SVG_NS)

from sciencebeam_gym.preprocess.blockify_annotations import (
    annotation_document_page_to_annotation_blocks,
    annotation_document_page_to_merged_blocks, annotated_blocks_to_svg,
    annotated_blocks_to_image, merge_blocks, AnnotationBlock)

# Tag names used to label the annotation blocks built by the tests.
TAG1 = 'tag1'
TAG2 = 'tag2'

# NOTE(review): usage of the next two values is outside the visible part of
# this module - presumably defaults for SVG fixtures; confirm.
DEFAULT_SVGX_WIDTH = 10
DEFAULT_FONT_SIZE = 10
# Colour mapped to TAG1 in the color_map passed to annotated_blocks_to_svg.
DEFAULT_COLOR = 'red'

# Bounding box given to blocks where the exact geometry is not the point.
DEFAULT_BOUNDING_BOX = BoundingBox(0, 0, 16, 10)

# Passed as ``nearby_tolerance`` to merge_blocks.
DEFAULT_NEARBY_TOLERANCE = 10


def setup_module():
    """Configure root logging at DEBUG level for this test module."""
    debug_level = logging.DEBUG
    logging.basicConfig(level=debug_level)


class TestAnnotatedBlocksToSvg(object):
    def test_should_create_rect_for_single_annotated_block(self):
        blocks = [AnnotationBlock(TAG1, DEFAULT_BOUNDING_BOX)]

        result_svg = annotated_blocks_to_svg(blocks,
                                             color_map={TAG1: DEFAULT_COLOR},
                                             width=100,
Exemple #19
0
 def test_should_not_equal_none(self):
     """Calling ``__eq__`` with ``None`` directly must give a falsy result."""
     bounding_box = BoundingBox(11, 12, 101, 102)
     equals_none = bounding_box.__eq__(None)
     assert not equals_none
Exemple #20
0
 def test_should_include_another_bounding_box_to_the_top_left(self):
     """Including a box up and to the left grows the box to cover both."""
     base_box = BoundingBox(100, 100, 200, 200)
     top_left_box = BoundingBox(10, 20, 50, 100)
     # Origin moves to the top-left box; the far edges stay at 100 + 200.
     expected_box = BoundingBox(10, 20, 100 + 200 - 10, 100 + 200 - 20)
     assert base_box.include(top_left_box) == expected_box
Exemple #21
0
 def test_should_indicate_empty_with_zero_height(self):
     """A box with zero height reports itself as empty."""
     zero_height_box = BoundingBox(0, 0, 100, 0)
     assert zero_height_box.empty()
Exemple #22
0
def parse_bounding_box(bounding_box_str):
    """Parse an ``"x y width height"`` string into a ``BoundingBox``.

    The four numbers may be separated by whitespace and/or commas, which is
    what the SVG ``viewBox`` syntax permits.

    Returns ``None`` for a falsy input (``None`` or an empty string).
    Raises ``ValueError`` if the string does not contain exactly four
    values or a value is not a valid float.
    """
    if not bounding_box_str:
        return None
    # Normalise comma separators so "0,0,10,20" and "0, 0 10 20" both parse.
    x, y, width, height = bounding_box_str.replace(',', ' ').split()
    return BoundingBox(float(x), float(y), float(width), float(height))
    SimplePage,
    SimpleLine,
    SimpleToken
)

from sciencebeam_gym.models.text.feature_extractor import (
    structured_document_to_token_props,
    token_props_list_to_features,
    token_props_list_to_labels,
    remove_labels_from_token_props_list,
    merge_with_cv_structured_document,
    NONE_TAG,
    CV_TAG_SCOPE
)

# Shared fixture values for this test module.
# NOTE(review): the tests using them are outside the visible part of the
# file; the per-constant roles below are inferred from the names - confirm.

# Presumably the page and token geometry used when building documents.
PAGE_BOUNDING_BOX = BoundingBox(0, 0, 100, 200)
TOKEN_BOUNDING_BOX = BoundingBox(10, 10, 10, 20)

# Distinct token texts.
TEXT_1 = 'Text 1'
TEXT_2 = 'Text 2'
TEXT_3 = 'Text 3'

# Distinct tag values.
TAG_1 = 'tag1'
TAG_2 = 'tag2'
TAG_3 = 'tag3'

# Presumably a custom tag scope name.
SCOPE_1 = 'scope1'


class TestStructuredDocumentToTokenProps(object):
    def test_should_return_empty_token_list_if_document_has_no_pages(self):
def scale_bounding_box(bounding_box, rx, ry):
    """Return a new ``BoundingBox`` scaled by *rx* horizontally and *ry* vertically.

    Both the position (x, y) and the size (width, height) are multiplied by
    the factor for their axis.
    """
    scaled_x = bounding_box.x * rx
    scaled_y = bounding_box.y * ry
    scaled_width = bounding_box.width * rx
    scaled_height = bounding_box.height * ry
    return BoundingBox(scaled_x, scaled_y, scaled_width, scaled_height)
Exemple #25
0
    annotate_structured_document_using_predicted_images,
    parse_args,
    CV_TAG_SCOPE
)


# Tag that COLOR_1 is mapped to in the color maps built by the tests.
TAG_1 = 'tag1'

# Text of the single token used in the document fixtures.
TOKEN_TEXT_1 = 'a token value'

# COLOR_1 marks pixels belonging to TAG_1; BG_COLOR is the fill colour
# passed to filled_image for everything else.
COLOR_1 = (1, 1, 1)
BG_COLOR = (255, 255, 255)

# Default image size for filled_image; larger fixtures use multiples of it.
DEFAULT_HEIGHT = 3
DEFAULT_WIDTH = 3
# Box starting at the origin with the default width and height.
DEFAULT_BOUNDING_BOX = BoundingBox(0, 0, DEFAULT_WIDTH, DEFAULT_HEIGHT)


def filled_image(color, color_map, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT):
    """Create an ``AnnotatedImage`` whose pixels are all set to *color*.

    The pixel data is an array of shape ``(height, width, 3)``; *color_map*
    is passed through to ``AnnotatedImage`` unchanged.
    """
    image_shape = (height, width, 3)
    pixel_data = np.full(image_shape, color)
    return AnnotatedImage(pixel_data, color_map)


def fill_rect(annoted_image, bounding_box, color):
    """Set every pixel of *annoted_image* inside *bounding_box* to *color*.

    Pixels are written one by one, row by row, through
    ``annoted_image.data[y, x]``. The box coordinates and size must be
    integers because they are used as ``range`` bounds.
    """
    pixel_coordinates = (
        (row, column)
        for row in range(bounding_box.y, bounding_box.y + bounding_box.height)
        for column in range(bounding_box.x,
                            bounding_box.x + bounding_box.width)
    )
    for row, column in pixel_coordinates:
        annoted_image.data[row, column] = color

Exemple #26
0
 def test_should_return_zero_tag_probality_if_bounding_box_is_empty(self):
     """An empty (zero-sized) box yields a probability of 0.0 for the tag."""
     annotated_image = filled_image(BG_COLOR, {TAG_1: COLOR_1})
     empty_box = BoundingBox(0, 0, 0, 0)
     probabilities = annotated_image.get_tag_probabilities_within(empty_box)
     assert probabilities.get(TAG_1) == 0.0
Exemple #27
0
def iter_svg_pages_for_lxml(lxml_root, add_background=True):
    """Yield one SVG root element for every ``//DOCUMENT/PAGE`` in *lxml_root*.

    For each page:

    * a ``viewBox`` of ``0 0 <width> <height>`` is set when the page has both
      'width' and 'height' attributes;
    * when *add_background* is true, a white full-size rect with class
      'background' is added first;
    * every ``TEXT`` element becomes a line group containing one SVG text
      element per ``TOKEN``;
    * consecutive lines whose parent is the same ``BLOCK`` element are
      collected in a shared block group, other lines are appended to the
      SVG root directly.

    Tokens with ``rotation == '1'`` and an angle of exactly 90 are positioned
    via a ``transform`` attribute instead of x/y.
    """
    previous_block = None
    previous_svg_block = None
    for page in lxml_root.xpath('//DOCUMENT/PAGE'):
        svg_root = etree.Element(SVG_DOC, nsmap=SVG_NSMAP)

        viewbox_width = page.attrib.get('width')
        viewbox_height = page.attrib.get('height')
        if viewbox_width and viewbox_height:
            svg_root.attrib['viewBox'] = '0 0 %s %s' % (
                viewbox_width, viewbox_height
            )

        if add_background:
            background_attrib = {
                'width': '100%',
                'height': '100%',
                'fill': 'white',
                'class': 'background'
            }
            svg_root.append(etree.Element(SVG_RECT, attrib=background_attrib))

        for line in page.xpath('.//TEXT'):
            line_group = etree.Element(
                SVG_G,
                nsmap=SVG_NSMAP,
                attrib={'class': SvgStyleClasses.LINE}
            )
            for token in line.xpath('./TOKEN'):
                token_attrib = token.attrib
                left = float(token_attrib.get('x'))
                top = float(token_attrib.get('y'))
                box_height = float(token_attrib.get('height'))
                box_width = float(token_attrib.get('width'))
                # The text is anchored at the baseline, falling back to y.
                baseline = float(token_attrib.get('base', top))
                text_attrib = {
                    'x': str(left),
                    'y': str(baseline),
                    'font-size': token_attrib.get('font-size'),
                    'font-family': token_attrib.get('font-name'),
                    'fill': token_attrib.get('font-color'),
                    SVGE_BOUNDING_BOX: svg_format_bounding_box(
                        BoundingBox(left, top, box_width, box_height)
                    )
                }
                angle = float(token_attrib.get('angle', '0'))
                is_rotated = (
                    token_attrib.get('rotation') == '1' and angle == 90.0
                )
                if is_rotated:
                    # Position via a transform: move to the vertical centre
                    # of the token box, then rotate by the negated angle.
                    vertical_center = top + box_height / 2.0
                    text_attrib.update({
                        'x': '0',
                        'y': '0',
                        'transform':
                        'translate({x} {y}) rotate({angle})'.format(
                            x=str(left),
                            y=str(vertical_center),
                            angle=str(-angle)
                        )
                    })
                line_group.append(
                    ElementWithText(SVG_TEXT, token.text, attrib=text_attrib)
                )

            parent = line.getparent()
            if parent.tag != 'BLOCK':
                # A line outside any block ends the current block run.
                previous_block = None
                previous_svg_block = None
                svg_root.append(line_group)
                continue
            if parent != previous_block:
                # First line of a new block: open a new block group.
                previous_svg_block = etree.Element(
                    SVG_G,
                    nsmap=SVG_NSMAP,
                    attrib={'class': SvgStyleClasses.BLOCK}
                )
                svg_root.append(previous_svg_block)
                previous_block = parent
            previous_svg_block.append(line_group)
        yield svg_root
Exemple #28
0
 def test_should_not_equal_bounding_boxes_with_different_height(self):
     """Boxes that differ only in height compare as not equal."""
     reference_box = BoundingBox(11, 12, 101, 102)
     taller_box = BoundingBox(11, 12, 101, 999)
     assert reference_box != taller_box
from sciencebeam_gym.models.text.feature_extractor import (
    structured_document_to_token_props, token_props_list_to_features, NONE_TAG)

from sciencebeam_gym.utils.bounding_box import (BoundingBox)

from sciencebeam_gym.models.text.crf.annotate_using_predictions import (
    annotate_structured_document_using_predictions,
    predict_and_annotate_structured_document, CRF_TAG_SCOPE)

# Tag passed as the single prediction and expected back from get_tag.
TAG_1 = 'tag1'

# Token texts for the document fixtures.
TOKEN_TEXT_1 = 'token 1'
TOKEN_TEXT_2 = 'token 2'

# NOTE(review): not used in the visible tests - presumably a generic token
# bounding box for tests further down; confirm.
BOUNDING_BOX = BoundingBox(0, 0, 10, 10)


class TestAnnotateStructuredDocumentUsingPredictions(object):
    """Tests for ``annotate_structured_document_using_predictions``."""

    def test_should_not_fail_with_empty_document(self):
        """An empty document with no predictions is handled without raising."""
        empty_document = SimpleStructuredDocument()
        no_predictions = []
        annotate_structured_document_using_predictions(
            empty_document, no_predictions
        )

    def test_should_tag_single_token_using_prediction(self):
        """A single prediction is stored as the token's tag in the CRF scope."""
        token = SimpleToken(TOKEN_TEXT_1)
        document = SimpleStructuredDocument(lines=[SimpleLine([token])])
        annotate_structured_document_using_predictions(document, [TAG_1])
        assigned_tag = document.get_tag(token, scope=CRF_TAG_SCOPE)
        assert assigned_tag == TAG_1
Exemple #30
0
 def test_should_equal_same_bounding_boxes(self):
     """Two boxes built from the same values compare as equal."""
     first_box = BoundingBox(11, 12, 101, 102)
     second_box = BoundingBox(11, 12, 101, 102)
     assert first_box == second_box