def test_interesting_pipeline():
    training_documents = LocalDocumentStore()

    # If we have a store, can we determine that we only want to process
    # documents that we haven't already got in the store

    def test_step(document):
        return document

    def test_model_store(document, context):
        context.get_store('my-model-store').put_native('cheese.txt', 'so cheesy'.encode('ascii'))
        return document

    training_prep = Pipeline.from_text("hello world", apply_lineage=False)
    training_prep.add_step(test_step)
    training_prep.add_label('training_document')
    training_prep.add_step(DocumentStoreWriter(training_documents))
    training_prep.run()

    assert training_documents.count() == 1

    model_store = LocalModelStore()
    training_pipeline = Pipeline.from_store(training_documents)
    training_pipeline.add_store('my-model-store', model_store)
    training_pipeline.add_step(test_model_store)
    training_pipeline.run()

    assert model_store.get_native('cheese.txt').read().decode('ascii') == 'so cheesy'
def get_test_pipeline(filename):
    pipeline = Pipeline(
        FolderConnector(path=str(get_test_directory()), file_filter=filename + '.txt'))
    pipeline.add_step(TextParser())
    context = pipeline.run()

    # Make sure the finders are available
    document = context.output_document
    return document
def test_folder_connector_unpack_wildcard():
    document_sink = LocalDocumentStore()
    pipeline = Pipeline(
        FolderConnector(path=str(get_test_directory()) + 'folder_unpack_test',
                        file_filter='*.*', unpack=True))
    # write the unpacked documents into the sink so the queries below have something to find
    pipeline.add_step(DocumentStoreWriter(document_sink))
    pipeline.run()

    # let's make sure we properly unpacked each document and have all ContentNodes
    for document_family in document_sink.query_families():
        doc = document_sink.get_latest_document_in_family(document_family)
        if doc.get_root().get_all_content().find('HSBC') > -1:
            assert len(doc.select("//*")) == 39
        elif doc.get_root().get_all_content().find('flea') > -1:
            assert len(doc.select("//*")) == 6
def test_function_step_to_yaml():
    pipeline = Pipeline.from_file('test')

    def do_it(document):
        print("hello")
        return document

    pipeline.add_step(do_it)
    print(pipeline.to_yaml())
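# A minimal companion sketch (an assumption, not part of the original suite): to_yaml should
# also capture the options of a RemoteStep. It only uses API that appears elsewhere in these
# tests (Pipeline.from_text, RemoteStep, to_yaml); the test name is hypothetical.
def test_remote_step_to_yaml_sketch():
    pipeline = Pipeline.from_text("hello world")
    pipeline.add_step(RemoteStep(ref='kodexa/pdf-parser', options={"analyze_layout": True}))
    print(pipeline.to_yaml())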
def test_lines_of_text():
    # first test with all content being placed on the root ContentNode
    pipeline = Pipeline.from_file(get_test_directory() + 'multiline_text.txt')
    pipeline.add_step(TextParser())
    context = pipeline.run()

    doc = context.output_document
    assert len(doc.get_root().get_children()) == 0
    assert len(doc.get_root().get_all_content()) > 0

    # next, test with all content being placed on the root's children
    pipeline = Pipeline.from_file(get_test_directory() + 'multiline_text.txt')
    pipeline.add_step(TextParser(lines_as_child_nodes=True))
    context = pipeline.run()

    doc = context.output_document
    assert len(doc.get_root().get_children()) > 0
    assert doc.get_root().get_content() is None
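# Hedged follow-on sketch (an assumption, not in the original file): with lines_as_child_nodes=True
# each child node should carry its own content while the root still rolls everything up. The test
# name is hypothetical; it reuses only calls shown above (get_children, get_content, get_all_content).
def test_lines_as_child_nodes_content_sketch():
    pipeline = Pipeline.from_file(get_test_directory() + 'multiline_text.txt')
    pipeline.add_step(TextParser(lines_as_child_nodes=True))
    doc = pipeline.run().output_document

    for child in doc.get_root().get_children():
        assert child.get_content() is not None
    assert len(doc.get_root().get_all_content()) > 0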
def test_kodexa_service():
    document_sink = InMemoryDocumentSink()

    pipeline = Pipeline(FolderConnector(path=str(get_test_directory()), file_filter='*.pdf'))
    pipeline.add_step(KodexaCloudService(slug='kodexa/pdf-parse', attach_source=True))
    pipeline.set_sink(document_sink)
    pipeline.run()

    # Make sure the finders are available
    document = document_sink.get_document(0)
    assert document
    print(document.to_json())
def get_test_pipeline(filename):
    document_sink = InMemoryDocumentSink()
    pipeline = Pipeline(
        FolderConnector(path=str(get_test_directory()), file_filter=filename + '.txt'))
    pipeline.add_step(TextParser(decode=True))
    pipeline.set_sink(document_sink)
    pipeline.run()

    # Make sure the finders are available
    document = document_sink.get_document(0)
    registry.add_mixin_to_document("core", document)
    return document
def test_spatial_doc_sample_two():
    # This test document and this portion of code is a snippet
    # from a test in the spatial actions tests. Adding this saved doc
    # and this section to ensure NodeTagger is tested.
    page_footer_re = r'Page \d+ of \d+$'

    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*[typeRegex("line.*")]',
                   content_re=page_footer_re,
                   tag_to_apply='page_footer'))
    pipeline.run()

    doc = pipeline.context.output_document
    assert doc.get_root() is not None
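# Minimal hedged sketch (an assumption, not from the original file): the same footer regex
# applied to an in-memory text document instead of the saved .kdxa fixture, mirroring the
# NodeTagger and get_all_tags usage shown in these tests. The test name is hypothetical.
def test_page_footer_tagger_sketch():
    document = Document.from_text('Page 3 of 10')
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*', content_re=r'Page \d+ of \d+$', tag_to_apply='page_footer'))
    context = pipeline.run()

    assert 'page_footer' in context.output_document.get_root().get_all_tags()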
def test_predefined_table_store():
    def process(document, context):
        if context.get_store('prediction-data-store'):
            document.get_root().content = 'We have a data store name'
        elif context.get_store_names() and len(context.get_store_names()) > 0:
            document.get_root().content = ' '.join(context.get_store_names())
        else:
            document.get_root().content = 'No stores on context'
        return document

    pipeline = Pipeline.from_text("Hello World")
    pipeline.add_store('prediction-data-store', TableDataStore())
    pipeline.add_step(process)
    context = pipeline.run()

    new_doc = context.output_document
    print(new_doc.content_node.content)
    assert new_doc.content_node.content == 'We have a data store name'
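# Hedged companion sketch (an assumption, not part of the original suite): exercising the
# opposite branch of the step above on a pipeline with no registered store. It assumes a
# pipeline created with from_text registers no stores by default, which is not verified here;
# the test and step names are hypothetical.
def test_no_store_on_context_sketch():
    def report_stores(document, context):
        names = context.get_store_names()
        document.get_root().content = ' '.join(names) if names else 'No stores on context'
        return document

    pipeline = Pipeline.from_text("Hello World")
    pipeline.add_step(report_stores)
    context = pipeline.run()

    assert context.output_document.content_node.content == 'No stores on context'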
def test_table_data_store():
    # Testing with 'include_node_content' set to True. Should result in 3 columns
    pipeline = Pipeline(
        Document.from_kdxa(os.path.join(get_test_directory(), 'tongue_twister.kdxa')))
    pipeline.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue',
                   node_only=True,
                   node_tag_uuid='test'))
    pipeline.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data', include_node_content=True))
    context = pipeline.run()
    compare_store(context, 'tagged_data', 'basic_store_tagged_data1.json')

    # Testing with 'include_node_content' set to False. Should result in 2 columns
    pipeline2 = Pipeline(
        Document.from_kdxa(os.path.join(get_test_directory(), 'tongue_twister.kdxa')))
    pipeline2.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue',
                   node_only=True))
    pipeline2.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data_2', include_node_content=False))
    context2 = pipeline2.run()
    compare_store(context2, 'tagged_data_2', 'basic_store_tagged_data2.json')
def test_to_yaml():
    # Create the pipeline
    pipeline = Pipeline.from_file('examples/USBankSample.pdf')
    pipeline.add_step(
        RemoteStep(ref='kodexa/pdf-parser',
                   options={
                       "layout_analysis_options": {"rollup": "word", "space_multiplier": 1},
                       "analyze_layout": True
                   },
                   attach_source=True))

    col_space_multiplier = 3.0
    page_number_re = r".*Page \d+ of \d+$"
    transactions_header_re = r'^Date\s+Description.*\s+Amount$'
    continued_re = r'^.*\(continued\)$'

    # Extract Other Deposits
    other_deposits_table_tag_name = "Other Deposits"
    other_deposits_re = r'^Other Deposits$'
    total_other_deposits_re = r'^Total Other Deposits.*\d{2}$'
    balance_re = r'^BALANCE YOUR ACCOUNT$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": other_deposits_table_tag_name,
                       "page_start_re": other_deposits_re,
                       "page_end_re": total_other_deposits_re,
                       "table_start_re": transactions_header_re,
                       "table_end_re": balance_re,
                       "col_marker_re": transactions_header_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': other_deposits_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True
                       }
                   }))

    # Extract Card Withdrawals
    card_withdrawals_table_tag_name = "Card Withdrawals"
    card_withdrawals_re = r'^Card Withdrawals$'
    subtotal_card_withdrawals_re = r'^Card \d{4} Withdrawals Subtotal.*\d{2}.$'
    total_card_withdrawals_re = r'^Total Card Withdrawals.*\d{2}.$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": card_withdrawals_table_tag_name,
                       "page_start_re": card_withdrawals_re,
                       "page_end_re": total_card_withdrawals_re,
                       "table_start_re": transactions_header_re,
                       "table_end_re": subtotal_card_withdrawals_re,
                       "col_marker_re": transactions_header_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': card_withdrawals_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True
                       }
                   }))

    # Extract Other Withdrawals
    other_withdrawals_table_tag_name = "Other Withdrawals"
    other_withdrawals_re = r'^Other Withdrawals$'
    total_other_withdrawals_re = r'^Total Other Withdrawals.*\d{2}.$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": other_withdrawals_table_tag_name,
                       "page_start_re": other_withdrawals_re,
                       "page_end_re": total_other_withdrawals_re,
                       "table_start_re": transactions_header_re,
                       "table_end_re": '',
                       "col_marker_re": transactions_header_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': other_withdrawals_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True
                       }
                   }))

    # Extract Checks
    checks_table_tag_name = "Checks"
    check_transactions_re = r'^Check Date .* Ref Number Amount$'
    checks_re = r'^Checks Presented Conventionally$'
    checks_paid_re = r'.*Conventional Checks Paid.*\d{2}.$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": checks_table_tag_name,
                       "page_start_re": checks_re,
                       "page_end_re": checks_paid_re,
                       "table_start_re": check_transactions_re,
                       "table_end_re": '',
                       "col_marker_re": check_transactions_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': checks_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True,
                           'tables_in_page_count': 2
                       }
                   }))

    print(pipeline.to_yaml())
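# Optional refactoring sketch (an assumption, not part of the original test): the four
# pattern-table-tagger steps above share the same options shape, so a small helper could cut
# the repetition. The helper name and keyword layout are hypothetical; the keys mirror the
# options dictionaries used above.
def make_tagger_options(tag_name, page_start_re, page_end_re, table_start_re, table_end_re,
                        col_marker_re, col_space_multiplier=3.0, **extra_extract_options):
    return {
        "col_space_multiplier": col_space_multiplier,
        "tag_to_apply": tag_name,
        "page_start_re": page_start_re,
        "page_end_re": page_end_re,
        "table_start_re": table_start_re,
        "table_end_re": table_end_re,
        "col_marker_re": col_marker_re,
        "extract": True,
        "extract_options": {
            'store_name': tag_name,
            'header_lines_count': 1,
            'first_col_has_text': True,
            **extra_extract_options
        }
    }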
def test_rollup_of_pdf():
    # first test - collapsing words and lines up to their common parent
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    # how many pre-rollup lines?
    assert len(test_doc.select('//line')) == 3824
    # how many pre-rollup words?
    assert len(test_doc.select('//word')) == 52903
    # how many pre-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the pre-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["word", "line"], separator_character=' '))
    rollup_pipeline.run()
    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    assert len(collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # second test - just collapse the line up to its parent (content-area) - roll up the line's children
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["line"], separator_character=' ',
                          get_all_content=True))
    rollup_pipeline.run()
    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    # verify that we can collapse line nodes AND include their children
    assert len(collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # third test - select specific nodes in which we'll do the roll ups
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    node_selector = "//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]"

    # verify we have 3 nodes match this selector
    node_matches = test_doc.select(node_selector)
    assert len(node_matches) == 3

    # before we rollup, let's make sure the matching nodes conform to known expectations
    assert len(node_matches[0].select('//word')) == 2
    assert len(node_matches[0].select('//line')) == 1
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 2
    assert len(node_matches[1].select('//line')) == 1
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 71
    assert len(node_matches[2].select('//line')) == 6
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(
            selector="//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]",
            collapse_type_res=["line"],
            separator_character=' ',
            get_all_content=True))
    rollup_pipeline.run()
    collapsed_doc = rollup_pipeline.context.output_document

    # check those matching nodes - we shouldn't have any words or lines, but
    # all other node_types should exist and the content should stay the same.
    assert len(node_matches[0].select('//word')) == 0
    assert len(node_matches[0].select('//line')) == 0
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 0
    assert len(node_matches[1].select('//line')) == 0
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 0
    assert len(node_matches[2].select('//line')) == 0
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    # how many post-rollup lines? (still have some lines, but fewer than we started with)
    assert len(test_doc.select('//line')) == 3816
    # how many post-rollup words? (still have some words, but fewer than we started with)
    assert len(test_doc.select('//word')) == 52828
    # how many post-rollup content-areas? (same number of content-areas)
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    # verify that we can collapse line nodes AND include their children
    assert len(collapsed_doc.select("//content-area")[12].get_all_content()) == 235
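# Hedged helper sketch (an assumption, not in the original file): summarise how many
# content-areas/lines/words a document holds, so the repeated rollup assertions above could
# be written as a single dictionary comparison per pass. The helper name is hypothetical.
def node_type_counts(doc):
    return {node_type: len(doc.select('//' + node_type))
            for node_type in ('content-area', 'line', 'word')}

# e.g. after the first rollup pass above:
#   node_type_counts(test_doc) == {'content-area': 817, 'line': 0, 'word': 0}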
def test_tag_multiple_regex_matches():
    doc_string = "Mary had a little lamb, little lamb, little lamb. Mary had a little lamb whose fleece was white as snow."

    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)',
                                 node_only=False))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect 4 tags to be applied, one for each instance of the word 'little'
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(feature_values) == list and len(feature_values) == 4
    assert feature_values[2]['start'] == 37
    assert feature_values[2]['end'] == 43

    # Because we didn't pass in a tag_uuid to the NodeTagger, each of the feature values should have a different UUID
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 4

    # Run the multiple tag test again, but this time pass in a tag_uuid
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)',
                                 node_only=False, node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # Now each of the feature values should have the same UUID
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 1

    # Now test that tagging the entire node, rather than references within the node, only produces 1 feature
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*',
                                 node_only=True))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect one tag to be applied and there to be no start or end value
    feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert feature_values['start'] is None and feature_values['end'] is None
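# Hedged helper sketch (an assumption, not part of the original suite): derive the expected
# start/end offsets of each 'little' match with re.finditer instead of hard-coding 37 and 43.
# Pure standard-library Python; 're' may already be imported at the top of the original module.
import re

def expected_tag_spans(text, pattern=r'little'):
    return [(m.start(), m.end()) for m in re.finditer(pattern, text)]

# expected_tag_spans(doc_string)[2] == (37, 43) for the doc_string used above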
def test_tag_copy():
    doc_string = "Mary had a little lamb, little lamb, little lamb. Mary had a little lamb whose fleece was white as snow."

    # data setup - creating a single tag with multiple matches...and then copying it
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)',
                                 node_only=False))
    context = pipeline.run()

    # both existing and new tag names must be provided, and they must be different, test for that first.
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name=None, new_tag_name='NewTagNone')
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name=None)
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name='SIZE')

    # verify that the only tag that exists is tag 'SIZE' and that there are only 4 feature values for it
    assert len(document.get_root().get_all_tags()) == 1
    assert 'SIZE' in document.get_root().get_all_tags()

    # now, let's copy the SIZE tags and create new ones called LAMB_INFO
    # reusing the previously tagged document and testing out the NodeTagCopy action
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagCopy(selector='//*[hasTag("SIZE")]', existing_tag_name='SIZE',
                                  new_tag_name='LAMB_INFO'))
    context = pipeline.run()

    # we should now have 4 feature values for 'LAMB_INFO' and 4 feature values for 'SIZE' - all with different UUIDs
    size_feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(size_feature_values) == list and len(size_feature_values) == 4
    lamb_info_feature_values = context.output_document.get_root().get_feature_values('tag', 'LAMB_INFO')
    assert type(lamb_info_feature_values) == list and len(lamb_info_feature_values) == 4
    lamb_info_features_uuids = set(dic['uuid'] for dic in lamb_info_feature_values)
    assert len(list(lamb_info_features_uuids)) == 4

    # Now test that tagging the entire node, rather than references within the node, only produces 1 feature
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*',
                                 node_only=True))
    context = pipeline.run()

    # now, let's copy the SIZE_2 tags and create new ones called LAMB_INFO_2 (using the node's copy_tag)
    for n in document.select('//*[hasTag("SIZE_2")]'):
        n.copy_tag(existing_tag_name='SIZE_2', new_tag_name='LAMB_INFO_2')

    # we should now have 1 feature value for 'LAMB_INFO_2' and 1 feature value for 'SIZE_2'
    size_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert type(size_2_feature_values) != list
    lamb_info_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'LAMB_INFO_2')
    assert type(lamb_info_2_feature_values) != list

    # now we need to test that when features are related (indicated by the same tag_uuid), they remain related when copying
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*', tag_to_apply='FLEECE_INFO', content_re=r'((white|snow))',
                   node_only=False, node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # now, let's copy the FLEECE_INFO tags and create new ones called WOOL_INFO
    pipeline = Pipeline(document)  # reusing the previously tagged document & testing out the NodeTagCopy action
    pipeline.add_step(
        NodeTagCopy(selector='//*[hasTag("FLEECE_INFO")]',
                    existing_tag_name='FLEECE_INFO', new_tag_name='WOOL_INFO'))
    context = pipeline.run()

    # The feature values should have the same UUID - for both WOOL_INFO and FLEECE_INFO
    wool_values = context.output_document.get_root().get_feature_values('tag', 'WOOL_INFO')
    assert type(wool_values) == list and len(wool_values) == 2
    wool_uuids = set(dic['uuid'] for dic in wool_values)
    assert len(list(wool_uuids)) == 1
    fleece_info_values = context.output_document.get_root().get_feature_values('tag', 'FLEECE_INFO')
    assert type(fleece_info_values) == list and len(fleece_info_values) == 2
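# Hedged follow-on sketch (an assumption, not part of the original suite): a copied tag is
# expected to keep the same character offsets as the tag it was copied from. The helper name
# is hypothetical and this behaviour is not verified by the tests above.
def assert_copy_preserves_offsets(root, original_tag_name, copied_tag_name):
    original = root.get_feature_values('tag', original_tag_name)
    copied = root.get_feature_values('tag', copied_tag_name)
    assert [(f['start'], f['end']) for f in original] == [(f['start'], f['end']) for f in copied]

# e.g. assert_copy_preserves_offsets(context.output_document.get_root(), 'SIZE', 'LAMB_INFO')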