def test_wa_loader_with_known_entities():
    """Check that only entity types listed in ``known_entities`` get markers.

    The input annotates one span as ``ORG`` and another as ``CITY`` while the
    loader is restricted to ``{'ORG'}``. The serialized tree is expected to
    wrap the ORG text in ``__START_ORG__`` / ``__END_ORG__`` and to contain
    the CITY text as plain, unmarked text.
    """
    markup = (
        b"<html><body><p>"
        b"<span wa-subtypes='' wa-id='227' wa-type='ORG' class='WebAnnotator_org'>Scrapinghub</span>"
        b" has an <b>office</b> in "
        b"<span wa-subtypes='' wa-id='228' wa-type='CITY' class='WebAnnotator_org'>Montevideo</span>"
        b"</p></body></html>"
    )
    expected = (
        b'<html><body><p> __START_ORG__ Scrapinghub __END_ORG__ '
        b'has an <b>office</b> in Montevideo</p></body></html>'
    )

    org_only_loader = WebAnnotatorLoader(known_entities={'ORG'})
    serialized = lxml.html.tostring(org_only_loader.loadbytes(markup))

    assert expected in serialized
def test_wa_loader():
    """Load the ``data/wa1.html`` fixture and check the serialized result.

    The output must contain the paragraph with ORG and CITY start/end markers
    and must not contain any ``wa-`` / ``WA-`` substring.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'data', 'wa1.html')
    serialized = lxml.html.tostring(WebAnnotatorLoader().load(fixture_path))

    expected_paragraph = (
        b"<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in "
        b"__START_CITY__ Montevideo __END_CITY__ </p>"
    )
    assert expected_paragraph in serialized, serialized

    # Neither lower- nor upper-case annotation attribute prefixes may remain.
    for leftover in (b"wa-", b"WA-"):
        assert leftover not in serialized, serialized
def test_wa_loader():
    """Load the module-level ``HTML`` sample and check the serialized result.

    The output must contain the paragraph with ORG and CITY start/end markers
    and must not contain any ``wa-`` / ``WA-`` substring.
    """
    ld = WebAnnotatorLoader()
    tree = ld.loadbytes(HTML)
    res = lxml.html.tostring(tree)

    # ``lxml.html.tostring`` returns bytes by default, so the expected
    # fragments must be bytes literals: on Python 3 ``"text" in res`` raises
    # TypeError instead of performing the containment check.
    expected = (
        b"<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in "
        b"__START_CITY__ Montevideo __END_CITY__ </p>"
    )
    assert expected in res, res
    assert b"wa-" not in res, res
    assert b"WA-" not in res, res
def test_wa_loader():
    """Load the ``data/wa1.html`` fixture and check the serialized result.

    The output must contain the paragraph with ORG and CITY start/end markers
    and must not contain any ``wa-`` / ``WA-`` substring.
    """
    ld = WebAnnotatorLoader()
    tree = ld.load(os.path.join(os.path.dirname(__file__), 'data', 'wa1.html'))
    res = lxml.html.tostring(tree)

    # ``lxml.html.tostring`` returns bytes by default, so the expected
    # fragments must be bytes literals: on Python 3 ``"text" in res`` raises
    # TypeError instead of performing the containment check.
    expected = (
        b"<p> __START_ORG__ Scrapinghub __END_ORG__ has an <b>office</b> in "
        b"__START_CITY__ Montevideo __END_CITY__ </p>"
    )
    assert expected in res, res
    assert b"wa-" not in res, res
    assert b"WA-" not in res, res
def _assert_entities(fragment, known_entities, expected):
    """Assert which tokens of ``fragment`` receive a non-``'O'`` tag.

    :param fragment: markup passed to ``WebAnnotatorLoader.loadbytes``.
    :param known_entities: forwarded to ``WebAnnotatorLoader`` as its
        ``known_entities`` argument.
    :param expected: dict mapping token text to its tag, for every token
        whose tag is not ``'O'``.
    :raises AssertionError: if the computed mapping differs from ``expected``.
    """
    ld = WebAnnotatorLoader(known_entities=known_entities)
    tree = ld.loadbytes(fragment)
    tokenizer = HtmlTokenizer()
    html_tokens, tags = tokenizer.tokenize_single(tree)

    # Build the mapping directly; tokens tagged 'O' are left out. As with the
    # dict() constructor, a repeated token text keeps the tag seen last.
    tagged = {
        html_token.token: tag
        for html_token, tag in zip(html_tokens, tags)
        if tag != 'O'
    }
    assert expected == tagged
def test_wa_loader_None_bug():
    """Load the ``data/wa2.html`` fixture and check the ORG end marker.

    The serialized output must contain ``__END_ORG__`` directly after the
    closing ``</em>`` of ``Inc.`` and before the closing ``</p>``.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'data', 'wa2.html')
    serialized = lxml.html.tostring(WebAnnotatorLoader().load(fixture_path))

    expected_tail = b'<em>Inc.</em> __END_ORG__ </p>'
    assert expected_tail in serialized, serialized