Code example #1
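This test trains a CRF suite model on one tagged token sequence and checks that it predicts the expected tags for a similar, slightly shorter sequence.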
    def test_should_learn_similar_sequence(self):
        structured_document_train = SimpleStructuredDocument(
            SimplePage([
                SimpleLine([
                    SimpleToken(TEXT_1, tag=TAG_1),
                    SimpleToken(TEXT_1, tag=TAG_1),
                    SimpleToken(TEXT_2, tag=TAG_2),
                    SimpleToken(TEXT_3, tag=TAG_3)
                ])
            ],
                       bounding_box=PAGE_BOUNDING_BOX))
        structured_document_test = SimpleStructuredDocument(
            SimplePage([
                SimpleLine([
                    SimpleToken(TEXT_1, tag=TAG_1),
                    SimpleToken(TEXT_2, tag=TAG_2),
                    SimpleToken(TEXT_3, tag=TAG_3)
                ])
            ],
                       bounding_box=PAGE_BOUNDING_BOX))
        token_props_list_train = list(
            structured_document_to_token_props(structured_document_train))
        X_train = [token_props_list_to_features(token_props_list_train)]
        y_train = [token_props_list_to_labels(token_props_list_train)]

        token_props_list_test = list(
            structured_document_to_token_props(structured_document_test))
        X_test = [token_props_list_to_features(token_props_list_test)]
        y_test = [token_props_list_to_labels(token_props_list_test)]

        with create_crf_suite_model() as model:
            model.fit(X_train, y_train)
            y_predicted = model.predict(X_test)
            assert y_predicted == y_test
Code example #2
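This test covers the sliding-window word features: token_props_list_to_features emits lowercased word features for up to two tokens on either side of each token, plus BOD/EOD markers where the window extends past the start or end of the document.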
    def test_should_add_previous_and_next_token_word_features(self):
        result = token_props_list_to_features([
            create_token_props(TEXT_1),
            create_token_props(TEXT_2),
            create_token_props(TEXT_3)
        ])
        assert [x.get('word.lower') for x in result] == [
            TEXT_1.lower(), TEXT_2.lower(), TEXT_3.lower()
        ]
        assert [x.get('-2:word.lower') for x in result] == [
            None, None, TEXT_1.lower()
        ]
        assert [x.get('-1:word.lower') for x in result] == [
            None, TEXT_1.lower(), TEXT_2.lower()
        ]
        assert [x.get('1:word.lower') for x in result] == [
            TEXT_2.lower(), TEXT_3.lower(), None
        ]
        assert [x.get('2:word.lower') for x in result] == [
            TEXT_3.lower(), None, None
        ]
        assert [x.get('BOD[-2]') for x in result] == [
            True, True, None
        ]
        assert [x.get('BOD[-1]') for x in result] == [
            True, None, None
        ]
        assert [x.get('EOD[1]') for x in result] == [
            None, None, True
        ]
        assert [x.get('EOD[2]') for x in result] == [
            None, True, True
        ]
Code example #3
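This test checks that scoped tags on a token are exposed as features keyed by scope name, in the form '<scope>.tag'.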
    def test_should_extract_scoped_tags(self):
        token_props = create_token_props(TEXT_1)
        token_props['scoped_tags'] = {
            SCOPE_1: TAG_1
        }
        result = token_props_list_to_features([token_props])
        assert [x.get('%s.tag' % SCOPE_1) for x in result] == [TAG_1]
Code example #4
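train_model is the training entry point: it loads token properties per document, converts them to feature and label sequences, fits a CrfSuiteModel, and returns the serialized model, recording timings for each stage.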
def train_model(file_list,
                cv_file_list,
                cv_source_tag_scope,
                page_range=None,
                progress=True):

    stop_watch_recorder = StopWatchRecorder()
    model = CrfSuiteModel()

    stop_watch_recorder.start('loading files')
    token_props_list_by_document = load_token_props_list_by_document(
        file_list,
        cv_file_list,
        cv_source_tag_scope=cv_source_tag_scope,
        page_range=page_range,
        progress=progress)

    # fail fast if no training documents could be loaded
    assert token_props_list_by_document

    stop_watch_recorder.start('converting to features')
    X = [token_props_list_to_features(x) for x in token_props_list_by_document]
    y = [token_props_list_to_labels(x) for x in token_props_list_by_document]

    get_logger().info('training model (with %d documents)', len(X))
    stop_watch_recorder.start('train')
    model.fit(X, y)

    stop_watch_recorder.start('serialize')
    serialized_model = serialize_model(model)

    stop_watch_recorder.stop()
    get_logger().info('timings: %s', stop_watch_recorder)

    return serialized_model
Code example #5
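This test checks the word-level features extracted for a single token: the lowercased word, prefix and suffix slices, and upper-case/digit flags.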
    def test_should_extract_various_word_features(self):
        result = token_props_list_to_features([create_token_props('TestMe')])
        assert [x.get('word.lower') for x in result] == ['testme']
        assert [x.get('word[:1]') for x in result] == ['t']
        assert [x.get('word[-3:]') for x in result] == ['tme']
        assert [x.get('word[-2:]') for x in result] == ['me']
        assert [x.get('word[:1].isupper') for x in result] == [True]
        assert [x.get('word.isupper') for x in result] == [False]
        assert [x.get('word.isdigit') for x in result] == [False]
Code example #6
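predict_and_annotate_structured_document is the prediction counterpart: it converts a structured document to token properties, runs the model on the resulting features, and writes the predicted tags back onto the document under the given tag scope.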
def predict_and_annotate_structured_document(structured_document,
                                             model,
                                             tag_scope=CRF_TAG_SCOPE):
    token_props = list(structured_document_to_token_props(structured_document))
    x = token_props_list_to_features(token_props)
    y_pred = model.predict([x])[0]
    annotate_structured_document_using_predictions(structured_document,
                                                   y_pred,
                                                   token_props,
                                                   tag_scope=tag_scope)
    return structured_document
Code example #7
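This test verifies, using a mocked model, that the predicted tag is written back to the token under the CRF tag scope and that the model receives the expected features.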
    def test_should_predict_and_annotate_single_token(self):
        token_1 = SimpleToken(TOKEN_TEXT_1, bounding_box=BOUNDING_BOX)
        structured_document = SimpleStructuredDocument(
            SimplePage(lines=[SimpleLine([token_1])],
                       bounding_box=BOUNDING_BOX))
        model = MagicMock()
        model.predict.return_value = [[TAG_1]]
        token_props = list(
            structured_document_to_token_props(structured_document))
        X = [token_props_list_to_features(token_props)]
        predict_and_annotate_structured_document(structured_document, model)
        assert structured_document.get_tag(token_1,
                                           scope=CRF_TAG_SCOPE) == TAG_1
        model.predict.assert_called_with(X)
Code example #8
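This test trains a model on a simple three-token sequence and checks that it reproduces the training labels.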
    def test_should_learn_simple_sequence(self):
        structured_document = SimpleStructuredDocument(
            SimplePage([
                SimpleLine([
                    SimpleToken(TEXT_1, tag=TAG_1),
                    SimpleToken(TEXT_2, tag=TAG_2),
                    SimpleToken(TEXT_3, tag=TAG_3)
                ])
            ],
                       bounding_box=PAGE_BOUNDING_BOX))
        token_props_list = list(
            structured_document_to_token_props(structured_document))
        get_logger().debug('token_props_list:\n%s', token_props_list)
        X = [token_props_list_to_features(token_props_list)]
        y = [token_props_list_to_labels(token_props_list)]
        get_logger().debug('X:\n%s', X)
        get_logger().debug('y:\n%s', y)
        with create_crf_suite_model() as model:
            model.fit(X, y)
            y_predicted = model.predict(X)
            assert y_predicted == y
Code example #9
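This test verifies that a trained model survives a pickle round trip and still predicts the training sequence correctly.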
    def test_should_pickle_and_unpickle_model(self):
        structured_document = SimpleStructuredDocument(
            SimplePage([
                SimpleLine([
                    SimpleToken(TEXT_1, tag=TAG_1),
                    SimpleToken(TEXT_2, tag=TAG_2),
                    SimpleToken(TEXT_3, tag=TAG_3)
                ])
            ],
                       bounding_box=PAGE_BOUNDING_BOX))
        token_props_list = list(
            structured_document_to_token_props(structured_document))
        X = [token_props_list_to_features(token_props_list)]
        y = [token_props_list_to_labels(token_props_list)]
        with create_crf_suite_model() as model:
            model.fit(X, y)
            serialized_model = pickle.dumps(model)

        model = pickle.loads(serialized_model)
        y_predicted = model.predict(X)
        assert y_predicted == y
Code example #10
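This test checks that the token's own tag, i.e. the training label, is not leaked into the feature dict.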
    def test_should_not_include_tag(self):
        result = token_props_list_to_features([
            create_token_props(TEXT_1, tag=TAG_1)
        ])
        assert [x.get('tag') for x in result] == [None]