def test_uuid_select():
    """Selecting a paragraph by its uuid returns the same content as ``//p``."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    first_p = document.select_first('//p')
    node_uuid = first_p.uuid
    print(node_uuid)
    print(first_p.content)

    # The uuid() selector function must resolve to the very same paragraph.
    assert document.select_first(f'//p[uuid({node_uuid})]').content == first_p.content
def test_html_rollup():
    """Rolling up ``<a>`` nodes removes them and merges their content into the parent."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    hang_seng_selector = '//*[contentRegex(".*Hang Seng Index.*")]'

    # before rollup
    links = document.select('//a')
    assert links[0].content == 'HSBC'
    assert links[1].content == 'Hang Seng Index'
    assert len(document.select(hang_seng_selector)[0].get_content_parts()) == 1

    # Collapse out all the <a> tags
    step = RollupTransformer(collapse_type_res=["a"])
    step.process(document)

    # after rollup
    assert len(document.select('//a')) == 0

    # see where the href rolled up
    expected_content = 'The London-headquartered bank is a heavyweight component of the Hang Seng Index . HSBC shares in Hong Kong closed 2.78% lower.'
    assert document.select(hang_seng_selector)[0].get_all_content() == expected_content
    assert len(document.select(hang_seng_selector)[0].get_content_parts()) == 3
def test_tagged_content():
    """Exercise the tag-related selector functions against the tagged news fixture."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    root = document.content_node

    # hasTag with the tag name supplied through a selector variable
    all_nodes = root.select('//*[hasTag($entityName)]', {"entityName": "ORG"})
    assert len(all_nodes) == 9

    all_nodes = root.select('//p stream *[hasTag("ORG")] stream *[hasTag("ORG")]')
    assert len(all_nodes) == 7

    all_nodes = root.select('//p intersect //*[hasTag("ORG")]')
    assert len(all_nodes) == 7

    # Has any tag to start
    tagged_nodes = root.select('//*[hasTag()]')
    assert len(tagged_nodes) == 22

    feature_nodes = root.select('//*[hasFeature()]')
    assert len(feature_nodes) == 32

    all_nodes = root.select('//*[hasTag("ORG")]')
    assert len(all_nodes) == 9

    # The union of a selection with itself yields twice the nodes (9 + 9).
    union_nodes = root.select('//*[hasTag("ORG")] | //*[hasTag("ORG")]')
    assert len(union_nodes) == 18

    node_match = all_nodes[0].select('*[tagRegex("O.*")]')
    assert len(node_match) == 1

    node_match2 = all_nodes[0].select('*[tagRegex("CHE.*")]')
    assert len(node_match2) == 0
def test_tag_key_value_include_exclude():
    """Check the store row counts for the extractor's include/exclude filters."""

    def extracted_pair_count(**filters):
        # Each scenario gets a freshly loaded document and a new context so
        # nothing carries over from the previous one; the file handle is
        # closed as soon as the bytes have been read.
        with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
            document = Document.from_msgpack(doc_file.read())
        step = TagsToKeyValuePairExtractor(store_name='test_store', **filters)
        context = PipelineContext()
        step.process(document, context)
        return context.get_store('test_store').count()

    # Testing include parameter
    assert extracted_pair_count(include=['DATE', 'LOC']) == 11

    # Testing exclude parameter
    assert extracted_pair_count(exclude=['DATE', 'LOC']) == 34

    # Testing both include and exclude parameters
    assert extracted_pair_count(include=['LOC'], exclude=['DATE']) == 5

    # Testing include only - this should be the same as before as 'exclude'
    # shouldn't have really done anything
    assert extracted_pair_count(include=['LOC']) == 5
def test_html_rollup():
    """Run the ``Rollup`` step over the news fixture and print the rendered text."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news.mdoc'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    # Collapse out all the <a> tags
    step = Rollup(collapse_type_res=["a"])
    result = step.process(document)

    # NOTE(review): this test only prints the output and asserts nothing.
    print(DocumentRender(result).to_text())
def test_instance_indexes():
    """Contrast ``(//p)[0]`` (one node) with ``//p[0]`` (18 nodes)."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1

    # Note this is important - the index here is not the position in the results
    # but the index of the node itself
    first_paragraph = document.select('//p[0]')
    assert len(first_paragraph) == 18
def test_parent_axis():
    """The ``parent::div`` axis resolves to a ``div`` for a paragraph and a link."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1

    paragraph_parents = first_paragraph[0].select('parent::div')
    assert len(paragraph_parents) == 1
    assert paragraph_parents[0].node_type == 'div'

    link = document.select('//a')[0]
    assert link.select('parent::div')[0].node_type == 'div'
def test_tag_key_value():
    """Run ``ExtractTagsToKeyValuePair`` and print the rows written to the store."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news-tagged.mdoc'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    # Write the document's tags into 'test_store' on the pipeline context
    step = ExtractTagsToKeyValuePair(store_name='test_store')
    context = PipelineContext()
    step.process(document, context)

    # NOTE(review): this test only prints the rows and asserts nothing.
    print(context.get_store('test_store').rows)
def test_tag_key_value():
    """The extractor writes 45 rows to the store; row 14 is ``('LOC', 'Europe')``."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    step = TagsToKeyValuePairExtractor(store_name='test_store')
    context = PipelineContext()
    step.process(document, context)

    store = context.get_store('test_store')
    assert store.count() == 45
    assert store.rows[14][0] == 'LOC'
    assert store.rows[14][1] == 'Europe'
def test_selector_complex_doc_1():
    """Positional selection ``(//p)[n]`` returns the n-th node of ``//p``."""
    # Use a context manager so the fixture file handle is closed promptly.
    with open(os.path.join(get_test_directory(), 'news.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    all_nodes = document.content_node.select('//*')
    assert len(all_nodes) == 39

    all_ps = document.content_node.select('//p')
    assert len(all_ps) == 18

    # all_ps is known to hold 18 entries, so enumerate covers positions 0..17.
    for pos, expected_p in enumerate(all_ps):
        selected_p = document.content_node.select(f'(//p)[{pos}]')
        assert len(selected_p) == 1
        assert selected_p[0].uuid == expected_p.uuid
def get_output_document(self, execution):
    """Download the document behind the execution's ``OUTPUT`` reference.

    Scans ``execution.documentReferences`` for entries whose
    ``referenceType`` is ``'OUTPUT'``; if there are several, the last one
    is used. The document is fetched with a GET request authenticated by
    the ``x-access-token`` header and decoded with ``Document.from_msgpack``.

    :param execution: object exposing ``id`` and ``documentReferences``
    :return: the decoded document, or ``None`` if there is no usable
        ``OUTPUT`` reference
    """
    output_references = [
        reference
        for reference in execution.documentReferences
        if reference.referenceType == 'OUTPUT'
    ]
    # Last match wins, mirroring a scan that keeps overwriting its result.
    final_reference = output_references[-1] if output_references else None

    if not final_reference:
        return None

    response = requests.get(
        f"{self.cloud_url}/api/sessions/{self.cloud_session.id}/executions/{execution.id}/documents/{final_reference.cloudDocument.id}",
        headers={"x-access-token": self.access_token})
    return Document.from_msgpack(response.content)