Example #1
0
    def test_should_learn_similar_sequence(self):
        """Train on one tagged line and predict the tags of a similar line.

        The training line repeats its first token (TEXT_1 / TAG_1), whereas
        the test line contains each of the three tokens exactly once.
        """
        train_line = SimpleLine([
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_2, tag=TAG_2),
            SimpleToken(TEXT_3, tag=TAG_3),
        ])
        train_document = SimpleStructuredDocument(
            SimplePage([train_line], bounding_box=PAGE_BOUNDING_BOX))
        test_line = SimpleLine([
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_2, tag=TAG_2),
            SimpleToken(TEXT_3, tag=TAG_3),
        ])
        test_document = SimpleStructuredDocument(
            SimplePage([test_line], bounding_box=PAGE_BOUNDING_BOX))

        # Features and labels are wrapped in a list: one entry per document.
        train_props = list(structured_document_to_token_props(train_document))
        train_features = [token_props_list_to_features(train_props)]
        train_labels = [token_props_list_to_labels(train_props)]

        test_props = list(structured_document_to_token_props(test_document))
        test_features = [token_props_list_to_features(test_props)]
        expected_labels = [token_props_list_to_labels(test_props)]

        with create_crf_suite_model() as crf_model:
            crf_model.fit(train_features, train_labels)
            predicted_labels = crf_model.predict(test_features)
            assert predicted_labels == expected_labels
 def test_should_return_empty_token_list_if_document_has_no_lines(self):
     """A document whose only page has no lines should yield no token props."""
     empty_page = SimplePage([], bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument(empty_page)
     token_props_list = list(structured_document_to_token_props(document))
     assert token_props_list == []
Example #3
0
 def test_should_return_page_width_and_height(self):
     """The ``page`` props of a token should carry the page's width and height."""
     page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1)])],
         bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument([page])
     token_props_list = list(structured_document_to_token_props(document))
     page_props_list = [
         token_props.get('page') for token_props in token_props_list
     ]
     widths = [page_props.get('width') for page_props in page_props_list]
     assert widths == [PAGE_BOUNDING_BOX.width]
     heights = [page_props.get('height') for page_props in page_props_list]
     assert heights == [PAGE_BOUNDING_BOX.height]
 def test_should_return_single_token_text(self):
     """The ``text`` prop should equal the text of the document's only token."""
     single_token_line = SimpleLine([SimpleToken(TEXT_1)])
     page = SimplePage([single_token_line], bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument(page)
     token_props_list = list(structured_document_to_token_props(document))
     texts = [token_props.get('text') for token_props in token_props_list]
     assert texts == [TEXT_1]
 def test_should_return_bounding_box(self):
     """The ``bounding_box`` prop should be the token's own bounding box."""
     boxed_token = SimpleToken(TEXT_1, bounding_box=TOKEN_BOUNDING_BOX)
     page = SimplePage(
         [SimpleLine([boxed_token])],
         bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument([page])
     token_props_list = list(structured_document_to_token_props(document))
     bounding_boxes = [
         token_props.get('bounding_box') for token_props in token_props_list
     ]
     assert bounding_boxes == [TOKEN_BOUNDING_BOX]
 def test_should_raise_error_if_token_props_do_not_match(self):
     """Annotating should raise AssertionError if the props text was altered."""
     token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token])],
         bounding_box=BOUNDING_BOX)
     document = SimpleStructuredDocument(page)
     props_list = list(structured_document_to_token_props(document))
     # Overwrite the extracted text so it differs from the document's token.
     props_list[0]['text'] = TOKEN_TEXT_2
     predictions = [TAG_1]
     with pytest.raises(AssertionError):
         annotate_structured_document_using_predictions(
             document, predictions, props_list)
def predict_and_annotate_structured_document(structured_document,
                                             model,
                                             tag_scope=CRF_TAG_SCOPE):
    """Tag ``structured_document`` using the predictions of ``model``.

    The document's token props are converted to features and handed to
    ``model.predict`` as a batch containing that single feature sequence.
    The first entry of the result is then applied to the document through
    ``annotate_structured_document_using_predictions`` with ``tag_scope``.

    Returns:
        The ``structured_document`` object that was passed in.
    """
    token_props_list = list(
        structured_document_to_token_props(structured_document))
    features = token_props_list_to_features(token_props_list)
    # predict() receives a one-item batch, so only its first result is used.
    batch_predictions = model.predict([features])
    annotate_structured_document_using_predictions(
        structured_document,
        batch_predictions[0],
        token_props_list,
        tag_scope=tag_scope)
    return structured_document
 def test_should_tag_single_token_using_prediction_and_check_token_props(
         self):
     """A single prediction plus matching token props should tag the token."""
     token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token])],
         bounding_box=BOUNDING_BOX)
     document = SimpleStructuredDocument(page)
     # Passed on exactly as returned, i.e. without converting it to a list.
     token_props = structured_document_to_token_props(document)
     annotate_structured_document_using_predictions(
         document, [TAG_1], token_props)
     actual_tag = document.get_tag(token, scope=CRF_TAG_SCOPE)
     assert actual_tag == TAG_1
Example #9
0
 def test_should_return_tag(self):
     """The ``tag`` prop should be the token's tag, or None when untagged."""
     first_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX)
     second_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3)])],
         bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument([first_page, second_page])
     token_props_list = list(structured_document_to_token_props(document))
     tags = [token_props.get('tag') for token_props in token_props_list]
     assert tags == [TAG_1, None, TAG_3]
Example #10
0
 def test_should_return_line_token_index_and_page_count(self):
     """``line_token`` props should hold the index within, and size of, a line.

     Two pages are used: a two-token line followed by a one-token line.
     """
     two_token_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX)
     one_token_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_3)])],
         bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument([two_token_page, one_token_page])
     token_props_list = list(structured_document_to_token_props(document))
     line_token_props = [
         token_props.get('line_token') for token_props in token_props_list
     ]
     indices = [props.get('index') for props in line_token_props]
     assert indices == [0, 1, 0]
     counts = [props.get('count') for props in line_token_props]
     assert counts == [2, 2, 1]
 def test_should_predict_and_annotate_single_token(self):
     """The mocked model's prediction should end up as the token's CRF tag.

     Also checks that the model was called with the features derived from
     the document's token props.
     """
     token = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
     page = SimplePage(
         lines=[SimpleLine([token])],
         bounding_box=BOUNDING_BOX)
     document = SimpleStructuredDocument(page)

     mock_model = MagicMock()
     mock_model.predict.return_value = [[TAG_1]]

     # Expected model input, computed before the document gets annotated.
     props_list = list(structured_document_to_token_props(document))
     expected_features = [token_props_list_to_features(props_list)]

     predict_and_annotate_structured_document(document, mock_model)

     actual_tag = document.get_tag(token, scope=CRF_TAG_SCOPE)
     assert actual_tag == TAG_1
     mock_model.predict.assert_called_with(expected_features)
 def test_should_return_scoped_tags(self):
     """``scoped_tags`` should only contain tags set with a ``tag_scope``.

     Tokens that are untagged, or tagged without a scope, are expected to
     produce an empty mapping.
     """
     unscoped_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_1, tag=TAG_1), SimpleToken(TEXT_2)])],
         bounding_box=PAGE_BOUNDING_BOX)
     scoped_page = SimplePage(
         [SimpleLine([SimpleToken(TEXT_3, tag=TAG_3, tag_scope=SCOPE_1)])],
         bounding_box=PAGE_BOUNDING_BOX)
     document = SimpleStructuredDocument([unscoped_page, scoped_page])
     token_props_list = list(structured_document_to_token_props(document))
     scoped_tags = [
         token_props.get('scoped_tags') for token_props in token_props_list
     ]
     assert scoped_tags == [{}, {}, {SCOPE_1: TAG_3}]
def load_and_convert_to_token_props(filename,
                                    cv_filename,
                                    cv_source_tag_scope,
                                    page_range=None):
    """Load a structured document and convert it to a list of token props.

    Args:
        filename: passed to ``load_structured_document`` to load the main
            document; also included in the error message on failure.
        cv_filename: if truthy, a second document is loaded from it (with the
            same ``page_range``) and merged into the main document using
            ``merge_with_cv_structured_document``.
        cv_source_tag_scope: forwarded to
            ``merge_with_cv_structured_document``.
        page_range: forwarded to ``load_structured_document`` for both files.

    Returns:
        A list built from ``structured_document_to_token_props``.

    Raises:
        RuntimeError: if loading, merging or converting fails. The message
            names ``filename`` and the original exception, which is attached
            as the cause via ``raise_from``.
    """
    try:
        structured_document = load_structured_document(filename,
                                                       page_range=page_range)
        if cv_filename:
            cv_structured_document = load_structured_document(
                cv_filename, page_range=page_range)
            structured_document = merge_with_cv_structured_document(
                structured_document,
                cv_structured_document,
                cv_source_tag_scope=cv_source_tag_scope)
        return list(structured_document_to_token_props(structured_document))
    # ``StandardError`` only exists on Python 2; on Python 3 referencing it
    # raised a NameError that hid the real failure. ``Exception`` is
    # available on both.
    except Exception as e:
        raise_from(
            RuntimeError('failed to process %s (due to %s: %s)' %
                         (filename, type(e), e)), e)
Example #14
0
 def test_should_learn_simple_sequence(self):
     """Fit on one tagged line and expect the same labels when predicting it."""
     tagged_line = SimpleLine([
         SimpleToken(TEXT_1, tag=TAG_1),
         SimpleToken(TEXT_2, tag=TAG_2),
         SimpleToken(TEXT_3, tag=TAG_3),
     ])
     document = SimpleStructuredDocument(
         SimplePage([tagged_line], bounding_box=PAGE_BOUNDING_BOX))
     props_list = list(structured_document_to_token_props(document))
     get_logger().debug('token_props_list:\n%s', props_list)
     # Features and labels are wrapped in a list: one entry per document.
     features = [token_props_list_to_features(props_list)]
     labels = [token_props_list_to_labels(props_list)]
     get_logger().debug('X:\n%s', features)
     get_logger().debug('y:\n%s', labels)
     with create_crf_suite_model() as crf_model:
         crf_model.fit(features, labels)
         predicted_labels = crf_model.predict(features)
         assert predicted_labels == labels
Example #15
0
    def test_should_pickle_and_unpickle_model(self):
        """A fitted model should still predict its labels after a pickle round trip."""
        tagged_line = SimpleLine([
            SimpleToken(TEXT_1, tag=TAG_1),
            SimpleToken(TEXT_2, tag=TAG_2),
            SimpleToken(TEXT_3, tag=TAG_3),
        ])
        document = SimpleStructuredDocument(
            SimplePage([tagged_line], bounding_box=PAGE_BOUNDING_BOX))
        props_list = list(structured_document_to_token_props(document))
        features = [token_props_list_to_features(props_list)]
        labels = [token_props_list_to_labels(props_list)]

        # Serialise while the model's context is still open ...
        with create_crf_suite_model() as fitted_model:
            fitted_model.fit(features, labels)
            model_bytes = pickle.dumps(fitted_model)

        # ... and predict with the restored copy after the context has exited.
        restored_model = pickle.loads(model_bytes)
        predicted_labels = restored_model.predict(features)
        assert predicted_labels == labels
 def test_should_return_empty_token_list_if_document_has_no_pages(self):
     """A document constructed without any pages should yield no token props."""
     document = SimpleStructuredDocument([])
     token_props_list = list(structured_document_to_token_props(document))
     assert token_props_list == []