def test_annotate_highlight():
    """Annotate two paragraphs in highlight mode, then undo it with clean.

    The settings are overridden with mode 'highlight' and a text unit of
    one item per html paragraph.  The annotated output must equal the
    stored 'one_per_unit_annotated' result, and cleaning it must report
    success and give back the original input.
    """
    config = get_config()
    config['settings'].update({
        'mode': 'highlight',
        'text_unit': {
            'items_per_unit': 1,
            'key': 'p',
            'name': 'html-paragraph'
        }
    })

    fixture = DATA['test_paragraphs']
    source_text = fixture['content'].encode('utf-8')
    result = annotate(source_text, DATA['elements'], config=config)
    wanted = fixture['results']['one_per_unit_annotated'].encode('utf-8')
    assert result == wanted

    # Round trip: removing the annotation has to restore the input.
    succeeded, restored_text = clean(result, config=config)
    assert succeeded
    assert source_text == restored_text
def test_markup_unknown():
    """Expect a KeyError when settings name a mode unknown to annotate."""
    config = get_config()
    config['settings']['mode'] = 'hocuspocus'

    with pytest.raises(KeyError):
        annotate('', DATA['elements'], config=config)
def test_annotate_own_validator_from_outside():
    """Annotate in highlight mode with a caller-supplied validator.

    ``my_validator`` is handed to ``annotate`` through ``own_validator``
    and the output is compared with the stored 'LA_annotated' result.
    """
    config = get_config()
    config['settings']['mode'] = 'highlight'

    fixture = DATA['test_paragraphs']
    source_text = fixture['content'].encode('utf-8')
    result = annotate(
        source_text,
        DATA['elements'],
        own_validator=[my_validator],
        config=config)

    assert result == fixture['results']['LA_annotated'].encode('utf-8')
def test_annotation_rules():
    """Annotate with an anchor pattern plus a decorate pattern.

    ``replaces_at_all`` is set to 5, so five applied links are expected.
    The annotated text and a fixed expected document are both passed
    through ``fix_bs4_parsing_spaces`` before they are compared.
    """
    # NOTE(review): another ``test_annotation_rules`` is defined further
    # down in this file.  The later definition rebinds the module-level
    # name, so pytest presumably never collects this one - consider
    # giving one of the two a distinct name.
    from tests.utils import fix_bs4_parsing_spaces, compare_results

    # One-, two- and three-letter tokens for each of the letters A..E.
    links = [
        {letter * repeat: {"type": "letter" + letter, "score": 42}}
        for letter in "ABCDE"
        for repeat in (1, 2, 3)
    ]
    html = """<div> <p id="1">lala A la lala AA BB B la C lalala DDD D E</p> <p id="2">la E EE AA lal CC C la la BB la DD D lala EE la</p> <p id="3">B la BB EEE A la CCC B la DDD C lala AAA D la BBB E</p> </div>"""

    link_limit = 5
    config = get_config()
    config['markup'] = {
        'anchor_pattern': '<a class="anchorman">{token}</a>',
        'decorate_anchor_key': 'the_anchor',
        'decorate': {
            'decorate_pattern': '<span type="{type}">{the_anchor}</span>',
            'decorate_anchor_key': 'the_anchor'
        }
    }
    config['settings']['return_applied_links'] = True
    config['rules']['replaces_at_all'] = link_limit

    annotated, applied, rest = annotate(html, links, config=config)
    assert len(applied) == link_limit

    expected = """<div> <p id="1">lala <span type="letterA"><a class="anchorman">A</a></span> la lala <span type="letterA"><a class="anchorman">AA</a></span> <span type="letterB"><a class="anchorman">BB</a></span> <span type="letterB"><a class="anchorman">B</a></span> la <span type="letterC"><a class="anchorman">C</a></span> lalala <span type="letterD">DDD</span> <span type="letterD">D</span> <span type="letterE">E</span></p> <p id="2">la <span type="letterE">E</span> <span type="letterE">EE</span> <span type="letterA">AA</span> lal <span 
type="letterC">CC</span> <span type="letterC">C</span> la la <span type="letterB">BB</span> la <span type="letterD">DD</span> <span type="letterD">D</span> lala <span type="letterE">EE</span> la</p> <p id="3"><span type="letterB">B</span> la <span type="letterB">BB</span> <span type="letterE">EEE</span> <span type="letterA">A</span> la <span type="letterC">CCC</span> <span type="letterB">B</span> la <span type="letterD">DDD</span> <span type="letterC">C</span> lala <span type="letterA">AAA</span> <span type="letterD">D</span> la <span type="letterB">BBB</span> <span type="letterE">E</span></p> </div>"""

    normalised_result = fix_bs4_parsing_spaces(annotated)
    normalised_expected = fix_bs4_parsing_spaces(expected)
    # compare_results(normalised_result, normalised_expected)
    assert normalised_result == normalised_expected
def test_clean_annotatation():
    """Expect clean to raise NotImplementedError for an unknown mode.

    The text is first annotated in highlight mode; the mode is then
    switched to 'unknown' before ``clean`` is called on the result.
    """
    config = get_config()
    config['settings']['mode'] = 'highlight'

    source_text = DATA['test_paragraphs']['content'].encode('utf-8')
    highlighted = annotate(source_text, DATA['elements'], config=config)

    # Switch to a mode that clean is expected not to implement.
    config['settings']['mode'] = 'unknown'
    with pytest.raises(NotImplementedError):
        clean(highlighted, config=config)
def test_wiki_linking():
    """Annotate the unlinked wiki body fixture and write the result to disk.

    Reads ``tests/data/wikibody_unlinked.html``, annotates it with the
    links from ``data.wiki_links`` and expects the output to contain
    ``class="anchorman"`` exactly 711 times.  The content of
    ``tests/data/index.tmpl`` followed by the annotated text is then
    written to ``tests/data/wikipedia_annotated.html``.
    """
    from data.wiki_links import links

    with open('tests/data/wikibody_unlinked.html') as f:
        text = f.read()

    cfg = get_config()
    cfg['markup']['attributes'] = {"class": "anchorman", "data-entity": "link"}
    annotated = annotate(text, links, config=cfg)

    # Call form instead of the print statement: with a single argument it
    # prints the same on Python 2 and is also valid syntax on Python 3.
    print(len(links))

    # get rest and check
    assert annotated.count('class="anchorman"') == 711

    # Context managers close both handles deterministically; the bare
    # open(...).read() / open(...).write() calls left that to the
    # garbage collector.
    with open('tests/data/index.tmpl', 'r') as template:
        content = template.read()
    with open('tests/data/wikipedia_annotated.html', 'w') as out:
        out.write(content + annotated)
def test_context_awareness():
    """Check how an already linked occurrence influences annotation.

    The input holds 'Vladimir Putin' once inside an existing ``<a>`` and
    once as plain text.  With the default rules only the plain
    occurrence is expected to be wrapped.  With ``items_per_unit`` set
    to 1, and afterwards with ``replaces_per_element`` limited to one
    per 'href', the text is expected to come back unchanged.
    """
    text = """<p>Intel analysis shows <a href="/oldlink">Vladimir Putin</a> approved election hacking Vladimir Putin.</p>"""
    links = [
        {"Vladimir Putin": {"href": "/putin", "type": "person", "score": 100.42}},
        {"Putin": {"href": "/putin", "type": "person", "score": 100.42}},
    ]

    config = get_config()
    config['settings']['log_level'] = 'DEBUG'
    config['markup'] = {
        'anchor_pattern': '<a class="anchorman" href="{href}" score="{score}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor'
    }

    # Default rules: the plain-text occurrence gets an anchor.
    with_defaults = annotate(text, links, config=config)
    assert with_defaults == """<p>Intel analysis shows <a href="/oldlink">Vladimir Putin</a> approved election hacking <a class="anchorman" href="/putin" score="100.42" type="person">Vladimir Putin</a>.</p>"""

    # One item per unit: the expected output is the untouched input.
    config['rules']['items_per_unit'] = 1
    one_per_unit = annotate(text, links, config=config)
    assert one_per_unit == text

    # One replacement per 'href' value: again the untouched input.
    config['rules']['items_per_unit'] = None
    config['rules']['replaces_per_element'] = {'number': 1, 'key': 'href'}
    one_per_href = annotate(text, links, config=config)
    assert one_per_href == text
def test_annotation_rules1():
    """Highlight plain text with ``replaces_at_all`` overridden to 3.

    The settings also carry a 'text' text unit with ``number_of_items``
    5; the output is compared with the stored 'text_annotated_rule1'
    result.
    """
    config = get_config()
    config['settings'].update({
        'mode': 'highlight',
        'replaces_at_all': 3,
        'text_unit': {
            'number_of_items': 5,
            'key': 't',
            'name': 'text'
        }
    })

    fixture = DATA['test_text']
    source_text = fixture['content'].encode('utf-8')
    result = annotate(source_text, DATA['elements'], config=config)

    assert result == fixture['results']['text_annotated_rule1'].encode('utf-8')
def test_annotate_highlight_rules2():
    """Highlight plain text with ``replaces_at_all`` 2, case-insensitive.

    The settings use a 'text' text unit; the output is compared with the
    stored 'text_annotated_rule2' result.
    """
    config = get_config()
    config['settings'].update({
        'mode': 'highlight',
        'replaces_at_all': 2,
        'case_sensitive': False,
        'text_unit': {
            'key': 't',
            'name': 'text'
        }
    })

    fixture = DATA['test_text']
    source_text = fixture['content'].encode('utf-8')
    result = annotate(source_text, DATA['elements'], config=config)

    assert result == fixture['results']['text_annotated_rule2'].encode('utf-8')
def test_annotation_rules():
    """Count the applied links under several rule combinations.

    The same three-paragraph document and the same fifteen links are
    annotated repeatedly while the entries in ``cfg['rules']`` are
    changed; each step asserts the number of applied links.
    """
    # One-, two- and three-letter tokens for each of the letters A..E.
    links = [
        {letter * repeat: {"type": "letter" + letter, "score": 42}}
        for letter in "ABCDE"
        for repeat in (1, 2, 3)
    ]
    html = """<div> <p id="1">lala A la lala AA BB B la C lalala DDD D E</p> <p id="2">la E EE AA lal CC C la la BB la DD D lala EE la</p> <p id="3">B la BB EEE A la CCC B la DDD C lala AAA D la BBB E</p> </div> """

    def applied_count(config):
        """Annotate the document with *config*; return len(applied)."""
        _, applied, _ = annotate(html, links, config=config)
        return len(applied)

    link_limit = 5
    config = get_config()
    config['settings']['return_applied_links'] = True
    config['markup'] = {
        'anchor_pattern': '<a class="anchorman" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor',
    }

    # Overall limit only.
    config['rules']['replaces_at_all'] = link_limit
    assert applied_count(config) == link_limit

    # Overall limit combined with one item per unit.
    config['rules']['replaces_at_all'] = link_limit
    config['rules']['items_per_unit'] = 1
    assert applied_count(config) == 3

    # Two items per unit, no overall limit, replaces_by_attribute on 'type'.
    config['rules']['items_per_unit'] = 2
    config['rules']['replaces_at_all'] = None
    config['rules']['replaces_by_attribute'] = {
        'key': 'type',
        'value_per_unit': 1
    }
    assert applied_count(config) == 6

    # Swap replaces_by_attribute for n_times_key_value on 'type'.
    config['rules']['items_per_unit'] = 2
    config['rules']['replaces_at_all'] = None
    config['rules']['n_times_key_value'] = {'key': 'type', 'value_overall': 1}
    del config['rules']['replaces_by_attribute']
    assert applied_count(config) == 5

    # Shape of the rule as written in the config file:
    #
    # filter_by_attribute:
    #     # strict: false  # check only one, true only valid if all match
    #     attributes:
    #     - key: type
    #       value: animal
    #     - key: score
    #       value: 10
    config = get_config()
    config['settings']['return_applied_links'] = True
    config['rules']['filter_by_attribute'] = {
        'attributes': [
            {'key': 'type', 'value': 'letterA'},
            {'key': 'score', 'value': 42},
        ]
    }
    assert applied_count(config) == 0
def test_annotate_settings():
    """Annotate a short article with a 'person' filter_by_attribute rule.

    Two links are offered ('Paris' of type location and 'Paris Hilton'
    of type person).  The expected output contains a single anchor
    around the last 'Paris'.
    """
    # NOTE(review): another ``test_annotate_settings`` is defined further
    # down in this file.  The later definition rebinds the module-level
    # name, so pytest presumably never collects this one - consider
    # giving one of the two a distinct name.
    from tests.utils import compare_results

    article = """<div><p>Paris Hilton wasn't going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in Paris.</p></div>"""
    expected = """<div><p>Paris Hilton wasn\'t going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in <a class="anchorman" lemma="Paris" type="location">Paris</a>.</p></div>"""
    links = [
        {u'Paris': {'lemma': u'Paris', 'type': 'location'}},
        {u'Paris Hilton': {'lemma': u'Paris Hilton', 'type': 'person'}},
    ]

    config = get_config()
    config['markup'] = {
        'anchor_pattern': '<a class="anchorman" lemma="{lemma}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor'
    }

    config['settings'].update({
        # "log_level": "DEBUG",
        "return_applied_links": True,
        # "forbidden_areas": {
        #     "tags": ["img", "a"],
        #     "classes": ["first", "p--heading-3"]
        # }
    })
    config['rules'].update({
        'return_applied_links': True,
        # apply high score candidates first
        'sort_by_item_value': {
            'key': 'score',
            'default': 0
        },
        # 'replaces_per_element': {
        #     'number': 1,
        #     'key': 'lemma'
        # },
        # 'replaces_at_all': 5,  # self.max_links, not available
        'longest_match_first': False,
        # 'replaces': {
        #     'by_attribute': {
        #         'key': 'type',
        #         # 'value_per_unit': 1
        #         'value_overall': 2  # self.max_per_etype
        #     }
        # },
        'items_per_unit': 4,  # self.links_per_paragraph
        'filter_by_attribute': {
            'attributes': [{
                'key': 'type',
                'value': 'person'
            }]
        }
    })

    annotated, applied, rest = annotate(article, links, config=config)

    # Collapse runs of spaces in the expectation before comparing.
    expected = re.sub(" +", " ", expected)
    compare_results(annotated, expected)
    assert annotated == expected
def test_anchor_format():
    """Annotate with ``elements.format_element`` replaced by a stub.

    ``anchorman.elements.format_element`` is swapped for a function that
    always returns "RUMBLE".  With ``replaces_at_all`` set to 5 the test
    expects five applied links, and it compares the annotated text with
    a document in which every link token reads "RUMBLE" (both sides are
    passed through ``fix_bs4_parsing_spaces`` first).
    """
    # One-, two- and three-letter tokens for each of the letters A..E.
    RLINKS = [
        {letter * repeat: {"type": "letter" + letter, "score": 42}}
        for letter in "ABCDE"
        for repeat in (1, 2, 3)
    ]
    RTEXT = """<div> <p id="1">lala A la lala AA BB B la C lalala DDD D E</p> <p id="2">la E EE AA lal CC C la la BB la DD D lala EE la</p> <p id="3">B la BB EEE A la CCC B la DDD C lala AAA D la BBB E</p> </div>"""

    from anchorman import elements

    def my_format_element(a, b, c):
        return "RUMBLE"

    # Keep a direct reference to the real function.  copy.copy() hands
    # back the very same object for functions, so no copy is needed.
    original_format_element = elements.format_element
    elements.format_element = my_format_element
    try:
        from anchorman import annotate, clean, get_config

        cfg = get_config()
        cfg['settings']['return_applied_links'] = True
        number_of_links_to_apply = 5
        cfg['rules']['replaces_at_all'] = number_of_links_to_apply
        cfg['markup']['decorate'] = {'tag': 'span'}

        annotated, applied, rest = annotate(RTEXT, RLINKS, config=cfg)
        assert len(applied) == number_of_links_to_apply

        expected = """<div> <p id="1">lala RUMBLE la lala RUMBLE RUMBLE RUMBLE la RUMBLE lalala RUMBLE RUMBLE RUMBLE</p> <p id="2">la RUMBLE RUMBLE RUMBLE lal RUMBLE RUMBLE la la RUMBLE la RUMBLE RUMBLE lala RUMBLE la</p> <p id="3">RUMBLE la RUMBLE RUMBLE RUMBLE la RUMBLE RUMBLE la RUMBLE RUMBLE lala RUMBLE RUMBLE la RUMBLE RUMBLE</p> </div>"""

        from tests.utils import fix_bs4_parsing_spaces, compare_results
        a = fix_bs4_parsing_spaces(annotated)
        b = fix_bs4_parsing_spaces(expected)
        # compare_results(a, b)
        assert a == b
    finally:
        # Restore the real formatter even when an assertion above fails,
        # otherwise the stub would stay installed for later tests.
        elements.format_element = original_format_element
def test_annotate_settings():
    """Walk through several rule settings on one four-paragraph document.

    The module-level ``LINKS`` are applied to the same text while
    ``replaces_at_all``, ``case_sensitive``, ``items_per_unit`` and
    ``replaces_per_element`` are changed step by step; each step asserts
    the number of applied links.  ``clean`` is exercised once on the
    first annotated result.
    """
    text = """<p class="first">Intel analysis shows Putin approved election hacking.</p>\n<p>Russian President Vladimir Putin told a group of <b>foreign policy experts</b> in southern Russia on Thursday that Donald Trump's "extravagant behavior" is just his way of getting his <a class="another one">message</a> across to voters.</p><p><img src="/image.png" title="Vladimir Putin"> The image shows him riding a bear in novo sibirsk.</p><p>And another paragraph about <a href="/link">Vladimir Putin</a> but there is a link already.</p>"""
    expected = """<p class="first"><a class="anchorman" href="/intel" score="33.33" type="company">Intel</a> analysis shows <a class="anchorman" href="/putin" score="100.42" type="person">Putin</a> approved election hacking.</p>\n<p>Russian President <a class="anchorman" href="/putin" score="100.42" type="person">Vladimir Putin</a> told a group of <b>foreign policy experts</b> in southern <a class="anchorman" href="/russia" score="23.12" type="place">Russia</a> on Thursday that <a class="anchorman" href="/trump" score="89.06" type="person">Donald Trump</a>'s "extravagant behavior" is just his way of getting his <a class="another one">message</a> across to voters.</p><p><img src="/image.png" title="Vladimir Putin"/> The image shows him riding a bear in novo sibirsk.</p><p>And another paragraph about <a href="/link">Vladimir Putin</a> but there is a link already.</p>"""

    # Disabled check against the default settings:
    # annotated = annotate(text, LINKS)
    # assert fix_bs4_parsing_spaces(annotated) == fix_bs4_parsing_spaces(expected)

    # --- 1. return applied links, limited by replaces_at_all ----------
    link_limit = 3
    config = get_config()
    config['markup'] = {
        'anchor_pattern': '<a class="anchorman" href="{href}" score="{score}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor',
        # needed to identify the anchors again when removing them
        'remove_tag': 'a',
        'remove_by_attribute': {'class': 'anchorman'}
    }
    config['settings']['return_applied_links'] = True
    config['rules']['replaces_at_all'] = link_limit

    annotated, applied, rest = annotate(text, LINKS, config=config)
    assert len(applied) == link_limit
    # Per the original author: Moscow and Election are not in rest
    # because they are not found in the string - hence the "- 2".
    assert len(rest) == len(LINKS) - link_limit - 2
    assert annotated.count('a class="anchorman"') == link_limit

    # --- clean up ------------------------------------------------------
    with_extra_anchor = annotated + '<p><a class="sth anchorman sth">I stay</a></p>'
    cleaned = clean(with_extra_anchor, config=config)
    assert 'class="anchorman"' not in cleaned
    assert 'a class="another one"' in cleaned
    assert 'a class="sth anchorman sth"' not in cleaned

    # --- 5.2 keyword "Election" matches "election" in the text ---------
    config['rules']['case_sensitive'] = False
    config['rules']['replaces_at_all'] = None
    annotated, applied_links, rest = annotate(text, LINKS, config=config)
    assert '<a class="anchorman" href="/election"' in annotated
    assert len(applied_links) == 6

    # --- 3. items replaced per paragraph -------------------------------
    # from now on, existing links are counted as well
    config['rules']['items_per_unit'] = 1
    annotated, applied_links, rest = annotate(text, LINKS, config=config)
    assert len(applied_links) == 1

    # --- 3. no limits, input repeated n times --------------------------
    config['rules']['replaces_at_all'] = None
    config['rules']['items_per_unit'] = None
    for n in (2, 10):
        annotated, applied_links, rest = annotate(
            text * n, LINKS * n, config=config)
        assert len(applied_links) == (len(LINKS) - 1) * n

    # --- 3. one replacement per 'href' ---------------------------------
    config['rules']['replaces_per_element'] = {"number": 1, "key": "href"}
    config['rules']['replaces_at_all'] = None
    config['rules']['items_per_unit'] = None

    text2 = """<p>Intel analysis shows Putin approved election hacking.</p>\n<p>Russian President Vladimir Putin told a group of <b>foreign policy experts</b> in southern Russia on Thursday.</p><p>Vladimir Putin bought Intel stocks.</b>"""
    links2 = [
        {"Vladimir Putin": {"href": "/putin", "type": "person", "score": 100.42}},
        {"Putin": {"href": "/putin", "type": "person", "score": 100.42}},
        {"Intel": {"href": "/intel", "type": "company", "score": 33.33}},
    ]
    annotated, applied_links, rest = annotate(text2, links2, config=config)
    assert len(applied_links) == 2