Esempio n. 1
0
def test_selector_regex():
    document = Document.from_text("Hello World")

    results = document.select('hasTag() = false()')
    assert len(results) == 1

    results = document.select('hasTag()')
    assert len(results) == 0

    results = document.content_node.select('.')
    assert len(results) == 1
    assert results[0].content == "Hello World"

    results = document.content_node.select('*[contentRegex("Hello.*")]')
    assert len(results) == 1
    assert results[0].content == "Hello World"

    results2 = document.content_node.select('*[contentRegex("Cheese.*")]')
    assert len(results2) == 0

    results = document.content_node.select('*[content()="Hello World"]')
    assert len(results) == 1
    assert results[0].content == "Hello World"

    results2 = document.content_node.select('*[contentRegex("Cheese.*",true)]')
    assert len(results2) == 0
Esempio n. 2
0
def test_tag_regex():
    document = Document.from_text("Hello World")
    results = document.content_node.select('*[typeRegex("te.*")]')
    assert len(results) == 1
    assert results[0].content == "Hello World"
    results2 = document.content_node.select('*[typeRegex("chee.*")]')
    assert len(results2) == 0
Esempio n. 3
0
def test_basic_local_document_store():
    lds = LocalDocumentStore(store_path='/tmp/s1', force_initialize=True)
    lds.put('my-doc', Document.from_text('hello!'))

    assert len(lds.list_objects()) == 1

    lds2 = LocalDocumentStore(store_path='/tmp/s1')
    assert len(lds2.list_objects()) == 1
Esempio n. 4
0
def test_node_only_tagging():
    doc = Document.from_text("Hello World")

    doc.content_node.tag(node_only=True, content_re="Hello World", tag_to_apply="test")
    assert len(doc.content_node.get_tag_values("test")) == 1

    doc.content_node.tag(node_only=True, content_re="Hello Cheese", tag_to_apply="test2")
    assert len(doc.content_node.get_tag_values("test2")) == 0
Esempio n. 5
0
def test_tag_multiple_regex_matches():
    doc_string = "Mary had a little lamb, little lamb, little lamb.  Mary had a little lamb whose fleece was white as snow."

    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect 4 tags to be applied, one for each instance of the word 'little'
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(feature_values) == list and len(feature_values) == 4
    assert feature_values[2]['start'] == 37
    assert feature_values[2]['end'] == 43

    # Because we didn't pass in a tag_uuid to the NodeTagger, each of the feature values should have a different UUID
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 4

    # Run the multiple tag test again, but this time pass in a tag_uuid
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False,
                                 node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # Now each of the feature values should have the same UUID
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 1

    # Now test that tagging the entire node, rather than references within the node, only produce 1 feature
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect one tag to be applied and there to be no start or end value
    feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert feature_values['start'] is None and feature_values['end'] is None
Esempio n. 6
0
def test_fixed_tagging_remove():
    doc = Document.from_text("Hello Philip")
    doc.content_node.tag('name', fixed_position=[6, 12])

    assert doc.content_node.get_tag_values('name')[0] == 'Philip'

    doc.content_node.remove_tag('name')

    assert len(doc.content_node.get_tag_values('name')) == 0
Esempio n. 7
0
def test_fixed_tagging_with_child():
    doc = Document.from_text("Hello")
    doc.content_node.add_child_content("text", "Philip")
    doc.content_node.add_child_content("text", "Dodds")

    # Hello Philip Dodds
    # 012345678901234567

    assert doc.content_node.get_all_content(strip=False)[6:12] == 'Philip'
    assert doc.content_node.get_all_content(strip=False)[13:18] == 'Dodds'

    doc.content_node.tag('name', fixed_position=[6, 12], separator=" ")

    assert doc.content_node.get_tag_values('name', include_children=True)[0] == 'Philip'
    doc.content_node.tag('lastName', fixed_position=[13, 18], separator=" ")

    assert doc.content_node.get_tag_values('lastName', include_children=True)[0] == 'Dodds'
Esempio n. 8
0
def test_selector_operators():
    document = Document.from_text("Hello World")

    # combining multiple functions

    # Feeling crazy?
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H.*D")]')) == 0
    # no dice - handle your capitalization correctly! :-)

    assert len(document.content_node.select('//*[typeRegex("te.*") or contentRegex("H.*D")]')) == 1

    # This should obviously return zero nodes, as 'Howdy' isn't in the document
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("Howdy")]')) == 0

    # What about this?  There's an H and a W...
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H*W")]')) == 0

    # Try that again, but modify the contentRegex
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H.*W")]')) == 1
    # yea!

    # Another variation - we expect success
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H.*d")]')) == 1
Esempio n. 9
0
def test_selector_2():
    document = Document.from_text("Hello World")
    results = document.content_node.select('*')
    assert len(results) == 1
    assert results[0].content == "Hello World"
Esempio n. 10
0
def test_tag_copy():
    doc_string = "Mary had a little lamb, little lamb, little lamb.  Mary had a little lamb whose fleece was white as snow."
    # data setup - creating a single tag with multiple matches...and then copying it
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    # both existing and new tag names must be provided, and they must be different, test for that first.
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name=None, new_tag_name='NewTagNone')

    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name=None)

    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name='SIZE')

    # verify that the only tag that exists is tag 'SIZE' and that there are only 4 feature values for it
    assert len(document.get_root().get_all_tags()) == 1
    assert 'SIZE' in document.get_root().get_all_tags()

    # now, let's copy the SIZE tags and create new ones called LAMB_INFO
    # reusing the previously tagged document and testing out NodeTagCopy action
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagCopy(selector='//*[hasTag("SIZE")]', existing_tag_name='SIZE', new_tag_name='LAMB_INFO'))
    context = pipeline.run()

    # we should now have 4 feature values for 'LAMB_INFO' and 4 feature values for 'SIZE' - all with different UUIDs
    size_feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(size_feature_values) == list and len(size_feature_values) == 4

    lamb_info_feature_values = context.output_document.get_root().get_feature_values('tag', 'LAMB_INFO')
    assert type(lamb_info_feature_values) == list and len(lamb_info_feature_values) == 4
    lamb_info_features_uuids = set(dic['uuid'] for dic in lamb_info_feature_values)
    assert len(list(lamb_info_features_uuids)) == 4

    # Now test that tagging the entire node, rather than references within the node, only produce 1 feature
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    # now, let's copy the SIZE_2 tags and create new ones called LAMB_INFO (using node's tag_copy)
    for n in document.select('//*[hasTag("SIZE_2")]'):
        n.copy_tag(existing_tag_name='SIZE_2', new_tag_name='LAMB_INFO_2')

    # we should now have 1 feature values for 'LAMB_INFO_2' and 1 feature values for 'SIZE_2'
    size_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert type(size_2_feature_values) != list
    lamb_info_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'LAMB_INFO_2')
    assert type(lamb_info_2_feature_values) != list

    # now we need to test that when features are related (indicated by the same tag_uuid), they remain related when copying
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*', tag_to_apply='FLEECE_INFO', content_re=r'((white|snow))', node_only=False,
                   node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # now, let's copy the SIZE tags and create new ones called LAMB_INFO
    pipeline = Pipeline(document)  # reusing the previously tagged document & testing out the NodeTagCopy action
    pipeline.add_step(
        NodeTagCopy(selector='//*[hasTag("FLEECE_INFO")]', existing_tag_name='FLEECE_INFO', new_tag_name='WOOL_INFO'))
    context = pipeline.run()

    # The feature values should have the same UUID - for both WOOL_INFO and FLEECE_INFO
    wool_values = context.output_document.get_root().get_feature_values('tag', 'WOOL_INFO')
    assert type(wool_values) == list and len(wool_values) == 2
    wool_uuids = set(dic['uuid'] for dic in wool_values)
    assert len(list(wool_uuids)) == 1

    fleece_info_values = context.output_document.get_root().get_feature_values('tag', 'FLEECE_INFO')
    assert type(fleece_info_values) == list and len(fleece_info_values) == 2