Example #1
0
def test_wa_loader_with_known_entities():
    """Entity types not listed in ``known_entities`` are left unannotated.

    ORG is whitelisted, so Scrapinghub gets START/END markers; CITY is
    not, so Montevideo is emitted as plain text.
    """
    annotated = (
        b"<html><body><p>"
        b"<span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span>"
        b" has an <b>office</b> in "
        b"<span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span>"
        b"</p></body></html>"
    )
    loader = WebAnnotatorLoader(known_entities={'ORG'})
    serialized = lxml.html.tostring(loader.loadbytes(annotated))
    expected = (
        b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ '
        b' has an <b>office</b> in Montevideo</p></body></html>'
    )
    assert expected in serialized
def test_wa_loader():
    """Both ORG and CITY annotations are converted to START/END markers
    and all WebAnnotator-specific markup is stripped from the output."""
    ld = WebAnnotatorLoader()
    tree = ld.loadbytes(HTML)
    res = lxml.html.tostring(tree)
    # BUG FIX: lxml.html.tostring returns *bytes*; the original compared
    # against str literals, and ``"wa-" not in res`` raises TypeError on
    # Python 3. All literals must be bytes.
    assert b"<p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in  __START_CITY__ Montevideo __END_CITY__ </p>" in res
    # No leftover WebAnnotator attributes in any casing.
    assert b"wa-" not in res, res
    assert b"WA-" not in res, res
Example #3
0
def test_wa_loader_with_known_entities():
    """Only whitelisted entity types receive START/END markers.

    CITY is not in ``known_entities``, so the Montevideo span is
    flattened to plain text while the ORG span is annotated.
    """
    markup = b"<html><body><p><span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span> has an <b>office</b> in <span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span></p></body></html>"
    result = lxml.html.tostring(
        WebAnnotatorLoader(known_entities={'ORG'}).loadbytes(markup)
    )
    assert (b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ '
            b' has an <b>office</b> in Montevideo</p></body></html>') in result
Example #4
0
def _assert_entities(fragment, known_entities, expected):
    """Load *fragment*, tokenize it, and check the extracted entity tags.

    Parameters:
        fragment: annotated HTML as bytes, fed to ``loadbytes``.
        known_entities: entity types the loader should recognize.
        expected: mapping of token -> tag for every non-'O' tag.
    """
    ld = WebAnnotatorLoader(known_entities=known_entities)
    tree = ld.loadbytes(fragment)
    tokenizer = HtmlTokenizer()

    html_tokens, tags = tokenizer.tokenize_single(tree)
    tokens = [html_token.token for html_token in html_tokens]
    # Dict comprehension instead of dict([...]) — same result without
    # building an intermediate list of pairs.
    assert expected == {token: tag
                        for token, tag in zip(tokens, tags) if tag != 'O'}
Example #5
0
def _assert_entities(fragment, known_entities, expected):
    """Load *fragment*, tokenize it, and check the extracted entity tags.

    Parameters:
        fragment: annotated HTML as bytes, fed to ``loadbytes``.
        known_entities: entity types the loader should recognize.
        expected: mapping of token -> tag for every non-'O' tag.
    """
    ld = WebAnnotatorLoader(known_entities=known_entities)
    tree = ld.loadbytes(fragment)
    tokenizer = HtmlTokenizer()

    html_tokens, tags = tokenizer.tokenize_single(tree)
    tokens = [html_token.token for html_token in html_tokens]
    # Dict comprehension instead of dict([...]) — same result without
    # building an intermediate list of pairs.
    assert expected == {token: tag
                        for token, tag in zip(tokens, tags) if tag != 'O'}