# Example no. 1
def test_annotate_highlight():
    """Test annotate with manipulated config and with mode highlight.

    Take two paragraphs and create a highlighted item for one item
    per paragraph of elements, then check that ``clean`` gives back the
    original text.
    """
    cfg = get_config()
    highlight = {
        'mode': 'highlight',
        'text_unit': {
            'items_per_unit': 1,
            'key': 'p',
            'name': 'html-paragraph',
        },
    }
    # NOTE(review): the statement applying ``highlight`` is missing from
    # this copy of the test. Updating the settings mirrors how the other
    # tests in this file switch the mode -- confirm against upstream.
    cfg['settings'].update(highlight)

    two_paragraphs = DATA['test_paragraphs']['content'].encode('utf-8')
    highlight_elements = DATA['elements']
    annotated = annotate(two_paragraphs, highlight_elements, config=cfg)

    expected = DATA['test_paragraphs']['results']['one_per_unit_annotated'].encode('utf-8')
    assert annotated == expected

    # Cleaning the annotated text must give back the untouched input.
    success, cleared_text = clean(annotated, config=cfg)
    assert success
    assert two_paragraphs == cleared_text
def test_markup_unknown():
    """Annotate with a settings mode that the markup does not know.

    The call is expected to fail with ``KeyError``.
    """
    config = get_config()
    config['settings']['mode'] = 'hocuspocus'

    with pytest.raises(KeyError):
        annotate('', DATA['elements'], config=config)
def test_annotate_own_validator_from_outside():
    """Annotate in highlight mode with a caller-supplied validator."""
    config = get_config()
    config['settings']['mode'] = 'highlight'

    paragraphs = DATA['test_paragraphs']['content'].encode('utf-8')
    elements = DATA['elements']

    annotated = annotate(
        paragraphs,
        elements,
        own_validator=[my_validator],
        config=config,
    )

    wanted = DATA['test_paragraphs']['results']['LA_annotated'].encode('utf-8')
    assert annotated == wanted
# Example no. 4
def test_annotation_rules():
    """Test annotate elements with default and manipulated config.

    Limits the overall number of replacements to five and checks that
    exactly five links are reported as applied and that the produced
    markup equals the expected markup after whitespace normalisation.
    """
    RLINKS = [
        {"A":   {"type": "letterA", "score": 42}},
        {"AA":  {"type": "letterA", "score": 42}},
        {"AAA": {"type": "letterA", "score": 42}},
        {"B":   {"type": "letterB", "score": 42}},
        {"BB":  {"type": "letterB", "score": 42}},
        {"BBB": {"type": "letterB", "score": 42}},
        {"C":   {"type": "letterC", "score": 42}},
        {"CC":  {"type": "letterC", "score": 42}},
        {"CCC": {"type": "letterC", "score": 42}},
        {"D":   {"type": "letterD", "score": 42}},
        {"DD":  {"type": "letterD", "score": 42}},
        {"DDD": {"type": "letterD", "score": 42}},
        {"E":   {"type": "letterE", "score": 42}},
        {"EE":  {"type": "letterE", "score": 42}},
        {"EEE": {"type": "letterE", "score": 42}}]

    # NOTE(review): the closing ``</div>`` line of this literal was missing
    # in this copy; it is restored to match the ``</div>`` that ends
    # ``expected`` below.
    RTEXT = """<div>
    <p id="1">lala A la lala AA BB B la C lalala DDD D E</p>
    <p id="2">la E EE AA lal CC C la la BB la DD D lala EE la</p>
    <p id="3">B la BB EEE A la CCC B la DDD C lala AAA D la BBB E</p>
    </div>"""

    cfg = get_config()
    cfg['settings']['return_applied_links'] = True

    number_of_links_to_apply = 5
    cfg['rules']['replaces_at_all'] = number_of_links_to_apply

    cfg['markup'] = {
        'anchor_pattern': '<a class="anchorman">{token}</a>',
        'decorate_anchor_key': 'the_anchor',
        'decorate': {
            'decorate_pattern': '<span type="{type}">{the_anchor}</span>',
            'decorate_anchor_key': 'the_anchor'
        }
    }

    annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
    assert len(applied) == number_of_links_to_apply

    expected = """<div> <p id="1">lala <span type="letterA"><a class="anchorman">A</a></span> la lala <span type="letterA"><a class="anchorman">AA</a></span> <span type="letterB"><a class="anchorman">BB</a></span> <span type="letterB"><a class="anchorman">B</a></span> la <span type="letterC"><a class="anchorman">C</a></span> lalala <span type="letterD">DDD</span> <span type="letterD">D</span> <span type="letterE">E</span></p> <p id="2">la <span type="letterE">E</span> <span type="letterE">EE</span> <span type="letterA">AA</span> lal <span type="letterC">CC</span> <span type="letterC">C</span> la la <span type="letterB">BB</span> la <span type="letterD">DD</span> <span type="letterD">D</span> lala <span type="letterE">EE</span> la</p> <p id="3"><span type="letterB">B</span> la <span type="letterB">BB</span> <span type="letterE">EEE</span> <span type="letterA">A</span> la <span type="letterC">CCC</span> <span type="letterB">B</span> la <span type="letterD">DDD</span> <span type="letterC">C</span> lala <span type="letterA">AAA</span> <span type="letterD">D</span> la <span type="letterB">BBB</span> <span type="letterE">E</span></p> </div>"""

    # Both sides go through the same helper before being compared.
    from tests.utils import fix_bs4_parsing_spaces
    a = fix_bs4_parsing_spaces(annotated)
    b = fix_bs4_parsing_spaces(expected)
    assert a == b
def test_clean_annotatation():
    """Clean annotated text after switching the config to an unknown mode.

    ``clean`` is expected to raise ``NotImplementedError`` in that case.
    """
    config = get_config()
    config['settings']['mode'] = 'highlight'

    paragraphs = DATA['test_paragraphs']['content'].encode('utf-8')
    elements = DATA['elements']
    annotated = annotate(paragraphs, elements, config=config)

    # Switch the very same config object to a different mode before cleaning.
    config['settings']['mode'] = 'unknown'

    with pytest.raises(NotImplementedError):
        _success, _cleared = clean(annotated, config=config)
# Example no. 6
def test_wiki_linking():
    """Annotate an unlinked Wikipedia body and count the created anchors.

    The annotated markup, prefixed with the ``index.tmpl`` content, is
    also written to ``tests/data/wikipedia_annotated.html``.
    """
    from data.wiki_links import links

    with open('tests/data/wikibody_unlinked.html') as f:
        # NOTE(review): the right-hand side of this assignment was missing
        # in this copy; reading the whole file is the presumed original.
        text = f.read()

    cfg = get_config()
    cfg['markup']['attributes'] = {"class": "anchorman", "data-entity": "link"}
    annotated = annotate(text, links, config=cfg)
    # The parenthesised form is valid on both Python 2 and Python 3.
    print(len(links))

    assert annotated.count('class="anchorman"') == 711

    # Context managers make sure both handles are closed (and the output
    # flushed) even when reading or writing fails.
    with open('tests/data/index.tmpl', 'r') as template:
        content = template.read()
    with open('tests/data/wikipedia_annotated.html', 'w') as out:
        out.write(content + annotated)
# Example no. 7
def test_context_awareness():
    """Test annotate on text that already contains a link.

    The first "Vladimir Putin" is wrapped in an existing ``<a>`` tag and
    is expected to stay as it is in all three scenarios below.
    """
    text = """<p>Intel analysis shows <a href="/oldlink">Vladimir Putin</a> approved election hacking Vladimir Putin.</p>"""

    links = [{
        "Vladimir Putin": {
            "href": "/putin",
            "type": "person",
            "score": 100.42
        }
    }, {
        "Putin": {
            "href": "/putin",
            "type": "person",
            "score": 100.42
        }
    }]

    cfg = get_config()
    cfg['settings']['log_level'] = 'DEBUG'

    # NOTE(review): the key of the pattern entry was missing in this copy;
    # 'anchor_pattern' is the key the other tests in this file use for the
    # same kind of value.
    cfg['markup'] = {
        'anchor_pattern':
        '<a class="anchorman" href="{href}" score="{score}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor'
    }

    # use default settings: only the occurrence outside the old link is linked
    annotated = annotate(text, links, config=cfg)
    expected = """<p>Intel analysis shows <a href="/oldlink">Vladimir Putin</a> approved election hacking <a class="anchorman" href="/putin" score="100.42" type="person">Vladimir Putin</a>.</p>"""
    assert annotated == expected

    # one item per unit: the text is expected to come back unchanged
    cfg['rules']['items_per_unit'] = 1
    annotated2 = annotate(text, links, config=cfg)
    expected2 = """<p>Intel analysis shows <a href="/oldlink">Vladimir Putin</a> approved election hacking Vladimir Putin.</p>"""
    assert annotated2 == expected2

    # one replacement per href instead: again expected to come back unchanged
    cfg['rules']['items_per_unit'] = None
    cfg['rules']['replaces_per_element'] = {'number': 1, 'key': 'href'}

    annotated3 = annotate(text, links, config=cfg)
    expected3 = """<p>Intel analysis shows <a href="/oldlink">Vladimir Putin</a> approved election hacking Vladimir Putin.</p>"""
    assert annotated3 == expected3
def test_annotation_rules1():
    """Test annotate highlight with overwritten config."""
    cfg = get_config()

    overwrite = {
        'mode': 'highlight',
        'replaces_at_all': 3,
        'text_unit': {
            'number_of_items': 5,
            'key': 't',
            'name': 'text'
        }
    }
    # NOTE(review): the statement applying ``overwrite`` is missing from
    # this copy of the test. Updating the settings mirrors how the other
    # tests in this file switch the mode -- confirm against upstream.
    cfg['settings'].update(overwrite)

    text = DATA['test_text']['content'].encode('utf-8')
    highlight_elements = DATA['elements']
    annotated = annotate(text, highlight_elements, config=cfg)

    text_annotated = DATA['test_text']['results']['text_annotated_rule1'].encode('utf-8')
    assert annotated == text_annotated
def test_annotate_highlight_rules2():
    """Test annotate highlight with overwritten, case-insensitive config."""
    cfg = get_config()

    overwrite = {
        'mode': 'highlight',
        'replaces_at_all': 2,
        'case_sensitive': False,
        'text_unit': {
            'key': 't',
            'name': 'text'
        }
    }
    # NOTE(review): the statement applying ``overwrite`` is missing from
    # this copy of the test. Updating the settings mirrors how the other
    # tests in this file switch the mode -- confirm against upstream.
    cfg['settings'].update(overwrite)

    text2 = DATA['test_text']['content'].encode('utf-8')
    highlight_elements = DATA['elements']
    annotated = annotate(text2, highlight_elements, config=cfg)

    text_annotated = DATA['test_text']['results']['text_annotated_rule2'].encode('utf-8')
    assert annotated == text_annotated
# Example no. 10
def test_annotation_rules():
    """Test annotate elements with default and manipulated config.

    Runs the same text through several rule combinations and checks how
    many links are reported as applied for each of them.
    """
    # One single-key link dict per token, in the order
    # A, AA, AAA, B, BB, BBB, ... E, EE, EEE.
    RLINKS = [
        {letter * repeat: {"type": "letter" + letter, "score": 42}}
        for letter in "ABCDE"
        for repeat in (1, 2, 3)
    ]

    # NOTE(review): the closing ``</div>`` line of this literal was missing
    # in this copy and is restored to balance the opening tag.
    RTEXT = """<div>
    <p id="1">lala A la lala AA BB B la C lalala DDD D E</p>
    <p id="2">la E EE AA lal CC C la la BB la DD D lala EE la</p>
    <p id="3">B la BB EEE A la CCC B la DDD C lala AAA D la BBB E</p>
    </div>"""

    cfg = get_config()
    cfg['settings']['return_applied_links'] = True

    cfg['markup'] = {
        'anchor_pattern': '<a class="anchorman" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor',
    }

    # overall limit only
    number_of_links_to_apply = 5
    cfg['rules']['replaces_at_all'] = number_of_links_to_apply
    annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
    assert len(applied) == number_of_links_to_apply

    # overall limit combined with one item per unit
    cfg['rules']['replaces_at_all'] = number_of_links_to_apply
    cfg['rules']['items_per_unit'] = 1
    annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
    assert len(applied) == 3

    # two items per unit, at most one per ``type`` value within a unit
    cfg['rules']['items_per_unit'] = 2
    cfg['rules']['replaces_at_all'] = None
    cfg['rules']['replaces_by_attribute'] = {
        'key': 'type',
        'value_per_unit': 1
    }
    annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
    assert len(applied) == 6

    # two items per unit, at most one per ``type`` value overall
    cfg['rules']['items_per_unit'] = 2
    cfg['rules']['replaces_at_all'] = None
    cfg['rules']['n_times_key_value'] = {'key': 'type', 'value_overall': 1}
    del cfg['rules']['replaces_by_attribute']
    annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
    assert len(applied) == 5

    # filter_by_attribute, as it would look in a config file:
    #
    #     # strict: false   # check only one, true only valid if all match
    #     attributes:
    #         - key: type
    #           value: animal
    #         - key: score
    #           value: 10

    # Start again from a fresh config so earlier rules do not interfere.
    cfg = get_config()
    cfg['settings']['return_applied_links'] = True
    cfg['rules']['filter_by_attribute'] = {
        'attributes': [{
            'key': 'type',
            'value': 'letterA'
        }, {
            'key': 'score',
            'value': 42
        }]
    }
    annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
    assert len(applied) == 0
# Example no. 11
def test_annotate_settings():
    """Test annotate with custom markup, rules and settings.

    ``RESULT`` links only the "Paris" in the last paragraph and leaves
    "Paris Hilton" untouched.
    """
    TEXT = """<div><p>Paris Hilton wasn't going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in Paris.</p></div>"""

    RESULT = """<div><p>Paris Hilton wasn\'t going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in <a class="anchorman" lemma="Paris" type="location">Paris</a>.</p></div>"""

    links = [{
        u'Paris': {
            'lemma': u'Paris',
            'type': 'location'
        }
    }, {
        u'Paris Hilton': {
            'lemma': u'Paris Hilton',
            'type': 'person'
        }
    }]

    cfg = get_config()

    # NOTE(review): the key of the pattern entry was missing in this copy;
    # 'anchor_pattern' is the key the other tests in this file use for the
    # same kind of value.
    cfg['markup'] = {
        'anchor_pattern':
        '<a class="anchorman" lemma="{lemma}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor'
    }

    rules = {
        'return_applied_links': True,
        # apply high score candidates first
        'sort_by_item_value': {
            'key': 'score',
            'default': 0
        },
        # 'replaces_per_element': {
        #     'number': 1,
        #     'key': 'lemma'
        # },
        # 'replaces_at_all': 5, #self.max_links,
        # not available 'longest_match_first': False,
        # 'replaces': {
        #     'by_attribute': {
        #         'key': 'type',
        #         # 'value_per_unit': 1
        #         'value_overall': 2 #self.max_per_etype
        #     }
        # },
        'items_per_unit': 4,  # self.links_per_paragraph,
        'filter_by_attribute': {
            'attributes': [{
                'key': 'type',
                'value': 'person'
            }]
        }
    }

    settings = {
        # "log_level": "DEBUG",
        "return_applied_links": True,
        # "forbidden_areas": {
        #     "tags": ["img", "a"],
        # "classes": ["first", "p--heading-3"]
        # }
    }

    # NOTE(review): the statements applying ``rules`` and ``settings`` are
    # missing from this copy. They have to be applied somehow: three values
    # are unpacked below (other tests enable 'return_applied_links' before
    # doing so) and RESULT leaves the person link unapplied. Whether the
    # original merged or replaced the config sections is unknown -- confirm
    # against upstream.
    cfg['rules'].update(rules)
    cfg['settings'].update(settings)

    annotated, applied, rest = annotate(TEXT, links, config=cfg)

    from tests.utils import compare_results

    # Collapse runs of spaces in the expectation before comparing.
    RESULT = re.sub(" +", " ", RESULT)
    compare_results(annotated, RESULT)
    assert annotated == RESULT
# Example no. 12
def test_anchor_format():
    """Test annotate with ``elements.format_element`` replaced by a stub.

    The stub returns the constant "RUMBLE", so that string is expected in
    the output wherever an element was formatted.
    """
    # One single-key link dict per token, in the order
    # A, AA, AAA, B, BB, BBB, ... E, EE, EEE.
    RLINKS = [
        {letter * repeat: {"type": "letter" + letter, "score": 42}}
        for letter in "ABCDE"
        for repeat in (1, 2, 3)
    ]

    # NOTE(review): the closing ``</div>`` line of this literal was missing
    # in this copy and is restored to balance the opening tag.
    RTEXT = """<div>
    <p id="1">lala A la lala AA BB B la C lalala DDD D E</p>
    <p id="2">la E EE AA lal CC C la la BB la DD D lala EE la</p>
    <p id="3">B la BB EEE A la CCC B la DDD C lala AAA D la BBB E</p>
    </div>"""

    from anchorman import elements

    def my_format_element(a, b, c):
        return "RUMBLE"

    # A plain reference is enough to restore the hook later: copy.copy()
    # hands back the very same object for functions.
    original_format_element = elements.format_element
    elements.format_element = my_format_element

    # try/finally so a failing assertion cannot leave the stub installed
    # for tests that run afterwards.
    try:
        from anchorman import annotate, get_config

        cfg = get_config()
        cfg['settings']['return_applied_links'] = True

        number_of_links_to_apply = 5
        cfg['rules']['replaces_at_all'] = number_of_links_to_apply
        cfg['markup']['decorate'] = {'tag': 'span'}

        annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
        assert len(applied) == number_of_links_to_apply

        # NOTE(review): only the first paragraph of this literal survived
        # in this copy. Paragraphs 2 and 3 are reconstructed on the
        # assumption that every link token is replaced, as in paragraph 1
        # -- confirm against upstream.
        expected = """<div>
    <p id="1">lala RUMBLE la lala RUMBLE RUMBLE RUMBLE la RUMBLE lalala RUMBLE RUMBLE RUMBLE</p>
    <p id="2">la RUMBLE RUMBLE RUMBLE lal RUMBLE RUMBLE la la RUMBLE la RUMBLE RUMBLE lala RUMBLE la</p>
    <p id="3">RUMBLE la RUMBLE RUMBLE RUMBLE la RUMBLE RUMBLE la RUMBLE RUMBLE lala RUMBLE RUMBLE la RUMBLE RUMBLE</p>
    </div>"""

        # Both sides go through the same helper before being compared.
        from tests.utils import fix_bs4_parsing_spaces
        a = fix_bs4_parsing_spaces(annotated)
        b = fix_bs4_parsing_spaces(expected)
        assert a == b
    finally:
        elements.format_element = original_format_element
# Example no. 13
def test_annotate_settings():
    """Test annotate elements with default and manipulated config.

    Walks through several rule combinations (overall limit, case
    sensitivity, limit per unit, limit per element) and checks the number
    of applied links, as well as the removal of anchors via ``clean``.
    """
    text = """<p class="first">Intel analysis shows Putin approved election hacking.</p>\n<p>Russian President Vladimir Putin told a group of <b>foreign policy experts</b> in southern Russia on Thursday that Donald Trump's "extravagant behavior" is just his way of getting his <a class="another one">message</a> across to voters.</p><p><img src="/image.png" title="Vladimir Putin"> The image shows him riding a bear in novo sibirsk.</p><p>And another paragraph about <a href="/link">Vladimir Putin</a> but there is a link already.</p>"""

    # Only referenced by the disabled default-settings check below.
    expected = """<p class="first"><a class="anchorman" href="/intel" score="33.33" type="company">Intel</a> analysis shows <a class="anchorman" href="/putin" score="100.42" type="person">Putin</a> approved election hacking.</p>\n<p>Russian President <a class="anchorman" href="/putin" score="100.42" type="person">Vladimir Putin</a> told a group of <b>foreign policy experts</b> in southern <a class="anchorman" href="/russia" score="23.12" type="place">Russia</a> on Thursday that <a class="anchorman" href="/trump" score="89.06" type="person">Donald Trump</a>'s "extravagant behavior" is just his way of getting his <a class="another one">message</a> across to voters.</p><p><img src="/image.png" title="Vladimir Putin"/> The image shows him riding a bear in novo sibirsk.</p><p>And another paragraph about <a href="/link">Vladimir Putin</a> but there is a link already.</p>"""

    # Disabled check with default settings:
    # annotated = annotate(text, LINKS)
    # assert fix_bs4_parsing_spaces(annotated) == fix_bs4_parsing_spaces(expected)

    # ---------------------------------
    # Overall limit, with applied links returned
    number_of_links_to_apply = 3
    cfg = get_config()

    cfg['markup'] = {
        'anchor_pattern': '<a class="anchorman" href="{href}" score="{score}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor',
        # in case we want to remove the anchors we need to identify them
        'remove_tag': 'a',
        'remove_by_attribute': {'class': 'anchorman'}
    }

    cfg['settings']['return_applied_links'] = True
    cfg['rules']['replaces_at_all'] = number_of_links_to_apply

    annotated, applied, rest = annotate(text, LINKS, config=cfg)

    # Moscow and Election are not in rest: they are not found in the string
    assert len(applied) == number_of_links_to_apply
    assert len(rest) == len(LINKS) - number_of_links_to_apply - 2
    assert annotated.count('a class="anchorman"') == number_of_links_to_apply

    # clean up: anchorman links go, foreign links stay
    this_one = annotated + '<p><a class="sth anchorman sth">I stay</a></p>'
    cleaned = clean(this_one, config=cfg)

    assert 'class="anchorman"' not in cleaned
    assert 'a class="another one"' in cleaned
    assert 'a class="sth anchorman sth"' not in cleaned

    # ---------------------------------
    # Case-insensitive: keyword "Election" matches "election" in the text
    cfg['rules']['case_sensitive'] = False
    cfg['rules']['replaces_at_all'] = None

    annotated, applied_links, rest = annotate(text, LINKS, config=cfg)
    assert '<a class="anchorman" href="/election"' in annotated
    assert len(applied_links) == 6

    # -------------------------------
    # One item per unit
    # from now on, we count all existing links also
    cfg['rules']['items_per_unit'] = 1
    annotated, applied_links, rest = annotate(text, LINKS, config=cfg)
    assert len(applied_links) == 1

    # -------------------------------
    # No limits, input text and links repeated n times
    cfg['rules']['replaces_at_all'] = None
    cfg['rules']['items_per_unit'] = None

    n = 2
    annotated, applied_links, rest = annotate(text*n, LINKS*n, config=cfg)
    assert len(applied_links) == (len(LINKS)-1)*n

    n = 10
    annotated, applied_links, rest = annotate(text*n, LINKS*n, config=cfg)
    assert len(applied_links) == (len(LINKS)-1)*n

    # -------------------------------
    # One replacement per element, keyed by href
    cfg['rules']['replaces_per_element'] = {"number": 1, "key": "href"}
    cfg['rules']['replaces_at_all'] = None
    cfg['rules']['items_per_unit'] = None

    text2 = """<p>Intel analysis shows Putin approved election hacking.</p>\n<p>Russian President Vladimir Putin told a group of <b>foreign policy experts</b> in southern Russia on Thursday.</p><p>Vladimir Putin bought Intel stocks.</b>"""

    # NOTE(review): the braces around the entries were missing in this
    # copy; one single-key dict per link is the shape the other link lists
    # in this file use.
    links2 = [
        {
            "Vladimir Putin": {
                "href": "/putin", "type": "person", "score": 100.42
            }
        },
        {
            "Putin": {
                "href": "/putin", "type": "person", "score": 100.42
            }
        },
        {
            "Intel": {
                "href": "/intel", "type": "company", "score": 33.33
            }
        },
    ]

    annotated, applied_links, rest = annotate(text2, links2, config=cfg)
    assert len(applied_links) == 2