Example #1
0
def test_table_data_store():
    """Tag 'flue' nodes, extract tags into a store, and check the store twice.

    The same document ('tongue_twister.kdxa') is processed by two separate
    pipelines that differ in the ``include_node_content`` flag given to
    TagsToKeyValuePairExtractor (and in whether ``node_tag_uuid`` is passed
    to NodeTagger).  Each resulting store is handed to ``compare_store``
    together with the name of a JSON file.
    """
    flue_selector = '//*[contentRegex(".*flue.*")]'

    # Pass 1: include_node_content=True -- should result in 3 columns.
    first_path = os.path.join(get_test_directory(), 'tongue_twister.kdxa')
    first_pipeline = Pipeline(Document.from_kdxa(first_path))
    first_tagger = NodeTagger(
        selector=flue_selector,
        tag_to_apply='has_flue',
        node_only=True,
        node_tag_uuid='test',
    )
    first_pipeline.add_step(first_tagger)
    first_extractor = TagsToKeyValuePairExtractor(
        store_name='tagged_data',
        include_node_content=True,
    )
    first_pipeline.add_step(first_extractor)
    first_context = first_pipeline.run()

    compare_store(first_context, 'tagged_data', 'basic_store_tagged_data1.json')

    # Pass 2: include_node_content=False -- should result in 2 columns.
    second_path = os.path.join(get_test_directory(), 'tongue_twister.kdxa')
    second_pipeline = Pipeline(Document.from_kdxa(second_path))
    second_tagger = NodeTagger(
        selector=flue_selector,
        tag_to_apply='has_flue',
        node_only=True,
    )
    second_pipeline.add_step(second_tagger)
    second_extractor = TagsToKeyValuePairExtractor(
        store_name='tagged_data_2',
        include_node_content=False,
    )
    second_pipeline.add_step(second_extractor)
    second_context = second_pipeline.run()

    compare_store(second_context, 'tagged_data_2', 'basic_store_tagged_data2.json')
Example #2
0
def test_tagging_issue_with_html():
    """Tag a fixed character range and read the same text back through the tag.

    The slice [707:710] of the unstripped document content is expected to be
    "IIJ"; after tagging that range, both the tag feature's value and the
    tagged node's content sliced by the feature's start/end must match it.
    """
    span_start, span_end = 707, 710
    expected_text = "IIJ"

    doc = Document.from_kdxa(get_test_directory() + 'tagging_issue.kdxa')

    # Sanity check on the raw content before any tag is applied.
    raw_content = doc.content_node.get_all_content(strip=False)
    assert raw_content[span_start:span_end] == expected_text

    # Tag that exact location, then recover the content via the tag.
    doc.content_node.tag(
        "test_tag",
        use_all_content=True,
        node_only=False,
        fixed_position=(span_start, span_end),
    )

    tagged_node = doc.select('//*[hasTag("test_tag")]')[0]
    tag_feature = tagged_node.get_feature_value("tag", "test_tag")
    assert tag_feature['value'] == expected_text

    # Select the node again and slice its content with the offsets stored on the tag.
    reselected_content = doc.select("//*[hasTag('test_tag')]")[0].get_all_content(strip=False)
    assert reselected_content[tag_feature['start']:tag_feature['end']] == expected_text
Example #3
0
def test_spatial_doc_sample_two():
    """Run NodeTagger with a content regex over 'before_fail.kdxa'.

    The document and this snippet were taken from the spatial actions tests
    so that NodeTagger gets exercised here as well.  The only check is that
    the pipeline's output document still has a root node.
    """
    footer_pattern = r'Page \d+ of \d+$'
    source_doc = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    footer_pipeline = Pipeline(source_doc)

    footer_tagger = NodeTagger(
        selector='//*[typeRegex("line.*")]',
        content_re=footer_pattern,
        tag_to_apply='page_footer',
    )
    footer_pipeline.add_step(footer_tagger)
    footer_pipeline.run()

    output_doc = footer_pipeline.context.output_document

    assert output_doc.get_root() is not None
Example #4
0
def test_rollup_of_pdf():
    """Exercise RollupTransformer on '20200709.kdxa' in three scenarios.

    1. Collapse both "word" and "line" nodes.
    2. Collapse only "line" nodes, passing ``get_all_content=True``.
    3. Collapse "line" nodes only under content-areas matched by a selector.

    In every scenario the node counts and total content length are asserted
    on the document object that was handed to the pipeline, and the content
    length of the 13th content-area is asserted on the pipeline's output
    document.
    """

    def load_doc():
        # A fresh copy of the document is loaded for each scenario.
        return Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    def assert_sample_area(doc):
        # Content-area at index 12 is expected to hold 235 characters after a rollup.
        assert len(doc.select("//content-area")[12].get_all_content()) == 235

    # ---- scenario 1: collapse words and lines up to their common parent ----
    doc_one = load_doc()

    # Pre-rollup counts of lines, words and content-areas.
    for selector, expected in (('//line', 3824),
                               ('//word', 52903),
                               ('//content-area', 817)):
        assert len(doc_one.select(selector)) == expected
    # Pre-rollup length of ALL the content in the document.
    assert len(doc_one.get_root().get_all_content()) == 329792

    pipeline_one = Pipeline(doc_one)
    word_line_rollup = RollupTransformer(collapse_type_res=["word", "line"],
                                         separator_character=' ')
    pipeline_one.add_step(word_line_rollup)
    pipeline_one.run()

    collapsed_doc = pipeline_one.context.output_document

    # Post-rollup: no lines or words remain, content-areas and total length unchanged.
    for selector, expected in (('//line', 0),
                               ('//word', 0),
                               ('//content-area', 817)):
        assert len(doc_one.select(selector)) == expected
    assert len(doc_one.get_root().get_all_content()) == 329792

    assert_sample_area(collapsed_doc)

    # ---- scenario 2: collapse only the line into its parent (content-area), ----
    # ---- rolling up the line's children                                      ----
    doc_two = load_doc()

    pipeline_two = Pipeline(doc_two)
    line_rollup = RollupTransformer(collapse_type_res=["line"],
                                    separator_character=' ',
                                    get_all_content=True)
    pipeline_two.add_step(line_rollup)
    pipeline_two.run()

    collapsed_doc = pipeline_two.context.output_document

    # Post-rollup: no lines or words remain, content-areas and total length unchanged.
    for selector, expected in (('//line', 0),
                               ('//word', 0),
                               ('//content-area', 817)):
        assert len(doc_two.select(selector)) == expected
    assert len(doc_two.get_root().get_all_content()) == 329792

    # Line nodes can be collapsed AND their children included.
    assert_sample_area(collapsed_doc)

    # ---- scenario 3: roll up only within specifically selected nodes ----
    doc_three = load_doc()

    node_selector = "//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]"

    # Exactly 3 nodes are expected to match this selector.
    node_matches = doc_three.select(node_selector)
    assert len(node_matches) == 3

    # Known shape of each matching node before the rollup:
    # (word count, line count, content-area count, content length).
    before_rollup = ((2, 1, 1, 14),
                     (2, 1, 1, 14),
                     (71, 6, 1, 500))
    for matched, (words, lines, areas, length) in zip(node_matches, before_rollup):
        assert len(matched.select('//word')) == words
        assert len(matched.select('//line')) == lines
        assert len(matched.select('//content-area')) == areas
        assert len(matched.get_all_content()) == length

    pipeline_three = Pipeline(doc_three)
    selective_rollup = RollupTransformer(selector=node_selector,
                                         collapse_type_res=["line"],
                                         separator_character=' ',
                                         get_all_content=True)
    pipeline_three.add_step(selective_rollup)
    pipeline_three.run()

    collapsed_doc = pipeline_three.context.output_document

    # The matching nodes should no longer contain words or lines, while the
    # content-area itself and its content length stay the same.
    after_rollup = ((0, 0, 1, 14),
                    (0, 0, 1, 14),
                    (0, 0, 1, 500))
    for matched, (words, lines, areas, length) in zip(node_matches, after_rollup):
        assert len(matched.select('//word')) == words
        assert len(matched.select('//line')) == lines
        assert len(matched.select('//content-area')) == areas
        assert len(matched.get_all_content()) == length

    # Document-wide: some lines and words remain (fewer than at the start),
    # the number of content-areas and the total content length are unchanged.
    for selector, expected in (('//line', 3816),
                               ('//word', 52828),
                               ('//content-area', 817)):
        assert len(doc_three.select(selector)) == expected
    assert len(doc_three.get_root().get_all_content()) == 329792

    # Line nodes can be collapsed AND their children included.
    assert_sample_area(collapsed_doc)
Example #5
0
def test_parent_child():
    """The first line under the first page must resolve 'parent::page' to that page."""
    doc = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    first_page = doc.select('//page')[0]

    first_line = first_page.select('//line')[0]
    resolved_page = first_line.select_first('parent::page')

    # Compare by uuid rather than by object identity.
    assert resolved_page.uuid == first_page.uuid
Example #6
0
def test_selector_deep():
    """Check '//line' counts when selecting from the first page and from the document."""
    doc = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')

    # Selecting from the first page node is expected to return 63 lines.
    first_page = doc.select('//page')[0]
    lines_from_first_page = first_page.select('//line')
    assert len(lines_from_first_page) == 63

    # Selecting from the document is expected to return 3143 lines.
    lines_from_document = doc.select('//line')
    assert len(lines_from_document) == 3143
Example #7
0
def test_fax2tagging():
    """Tag a fixed range in 'fax2.kdxa' as "phone" and verify the tagged text.

    Both the content of the first node carrying the tag and the 'value' of
    its tag feature are expected to equal the phone number.
    """
    expected_phone = '785-368-1772'
    fax_doc = Document.from_kdxa(get_test_directory() + 'fax2.kdxa')

    fax_doc.content_node.tag("phone", use_all_content=True, fixed_position=[146, 158])

    tagged_node = fax_doc.select("//*[hasTag('phone')]")[0]
    assert tagged_node.content == expected_phone

    # Select again and read the value recorded on the tag feature itself.
    phone_feature = fax_doc.select("//*[hasTag('phone')]")[0].get_feature_value("tag", "phone")
    assert phone_feature['value'] == expected_phone