def test_get_title():
    """get_title() returns the <title> of the document head, ignoring later <title> tags."""
    samples = [
        """<html><head><title>Test title</title></head><body>x</body></html>""",
        """<html><title>Test title</title>XX</html>""",
        """<html><head><title>Test title</title></head><body><title>x</title></body></html>""",
    ]
    for html in samples:
        assert HTMLDocument(html).parse().get_title() == "Test title"
def test_get_url_words():
    """get_url_words() tokenizes the URL into lowercase words (no scheme, query or fragment)."""
    nyt_url = "http://www.nytimes.com/2011/10/06/arts/music/maceo-parker.html?print=true#hash"
    nyt_words = [
        "nytimes", "com", "2011", "10", "06",
        "arts", "music", "maceo", "parker", "html",
    ]
    assert HTMLDocument("", url=nyt_url).parse().get_url_words() == nyt_words

    wiki_url = "https://en.wikipedia.org/wiki/Nine_Inch_Nails"
    wiki_words = ["en", "wikipedia", "org", "wiki", "nine", "inch", "nails"]
    assert HTMLDocument("", url=wiki_url).parse().get_url_words() == wiki_words
def test_get_canonical_url():
    """parse_canonical_url() honors <link rel=canonical> case-insensitively,
    resolves relative hrefs against the page URL, and rejects cross-domain targets."""

    def canonical(html, url):
        # Parse the page and return its canonical URL object (or None).
        return HTMLDocument(html, url=url).parse().parse_canonical_url()

    base = "http://example.com/page.html"

    # No <link rel=canonical> at all
    assert canonical("""<html><head></head><body>x</body></html>""", base) is None

    # An empty href is ignored
    assert canonical(
        """<html><head><link rel="canonical" href="" /></head><body>x</body></html>""",
        base) is None

    # Plain absolute canonical URL
    assert canonical(
        """<html><head><link rel="canonical" href="http://example.com/page2.html" /></head><body>x</body></html>""",
        base).url == "http://example.com/page2.html"

    # Tag & attribute names are matched case-insensitively
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="http://example.com/page2.html" /></head><body>x</body></html>""",
        base).url == "http://example.com/page2.html"

    # Cross domain blocked for now
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="http://example2.com/page2.html" /></head><body>x</body></html>""",
        base) is None

    # Relative URLs
    dir_base = "http://example.com/dir/page.html"
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="/dir2/page2.html" /></head><body>x</body></html>""",
        dir_base).url == "http://example.com/dir2/page2.html"
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="dir2/page2.html" /></head><body>x</body></html>""",
        dir_base).url == "http://example.com/dir/dir2/page2.html"
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="//example.com/dir2/page2.html" /></head><body>x</body></html>""",
        dir_base).url == "http://example.com/dir2/page2.html"
def test_encoding_aliases():
    """Both the "tis-620" and "windows-874" charset declarations resolve to the cp874 codec."""
    from cosrlib.document.html import HTMLDocument

    tis = HTMLDocument(
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>"""
    )
    assert tis.encoding.detect().name == "cp874"
    tis.parse()

    win = HTMLDocument(
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>"""
    )
    assert win.encoding.detect().name == "cp874"
    win.parse()
def format_html_title(title, url=None):
    """Render *title* inside a minimal HTML page, parse it, and run format_title on it."""
    html = """ <html><head><title>%s</title></head><body>Hello</body></html> """ % title
    doc = HTMLDocument(html, url=url)
    doc.parse()
    return format_title(doc, {})
def test_parsing_samples(sample_name):
    """Parse one sample HTML file and check its expected title/summary/word metadata.

    The expectations come from the SAMPLES registry keyed by file name.
    """
    metadata = SAMPLES[sample_name]
    sample_file = "tests/testdata/html_page_samples/%s" % sample_name

    with open(sample_file, "r") as f:
        html = f.read()

    page = HTMLDocument(html).parse()

    if "title" in metadata:
        assert metadata["title"] == page.get_title()

    if "summary" in metadata:
        assert metadata["summary"] == page.get_summary()

    # for k, g in sorted(page.get_word_groups().items()):
    #     print k, g

    words = page.get_all_words()
    lower_words_set = set([w.lower() for w in words])

    # Uncomment this to debug
    if metadata.get("debug"):
        # Parenthesized so the statement is valid under both Python 2 and 3
        # (the original bare `print words` statement is Python-2-only).
        print(words)

    for word in metadata.get("assert_words_missing", []):
        assert word not in lower_words_set

    for word in metadata.get("assert_words", []):
        assert word in lower_words_set
def test_get_url():
    """get_url() falls back to the fetch URL unless a canonical link tag overrides it."""
    cases = [
        # When none is given, we take the URL
        ("""<html><head></head><body>x</body></html>""",
         "http://example.com/page.html"),
        # But when a tag is present, it has precedence
        ("""<html><head><link rel="canonical" href="http://example.com/page2.html" /></head><body>x</body></html>""",
         "http://example.com/page2.html"),
        # Including with strange caps
        ("""<htmL><heaD><linK reL="CANonical" hreF="http://example.com/Page2.html" /></head><body>x</body></html>""",
         "http://example.com/Page2.html"),
    ]
    for html, expected in cases:
        page = HTMLDocument(html, url="http://example.com/page.html").parse()
        assert page.get_url().url == expected
def test_get_domain_paid_words():
    """get_domain_paid_words() keeps only the paid-level domain word (drops www + public suffix)."""
    url = "http://www.bbc.co.uk/2011/10/06/arts/music/maceo-parker.html?print=true"
    doc = HTMLDocument("", url=url)
    assert doc.get_domain_paid_words() == ["bbc"]
def test_encoding_x_user_defined():
    """The non-standard "x-user-defined" charset is detected under its own name."""
    from cosrlib.document.html import HTMLDocument

    document = HTMLDocument(
        """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    )
    assert document.encoding.detect().name == "x-user-defined"
    document.parse()
def test_encoding_xml():
    """An encoding declared in an XML prolog is picked up by the detector."""
    from cosrlib.document.html import HTMLDocument

    document = HTMLDocument(
        """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html> <html lang="en" ></html> """
    )
    assert document.encoding.detect().name == "shift_jis"
    document.parse()
def test_get_hyperlinks_single():
    """A single absolute link: href is resolved to a URL object and its text tokenized.

    NOTE(review): renamed from test_get_hyperlinks — another test with that
    exact name is defined later in this file, which shadowed this one so
    pytest never collected it.
    """
    html = """<html><head></head><body>before <a href="http://example.com/page1">link text</a> after</body></html>"""
    page = HTMLDocument(html, url="http://example.com/page.html").parse()

    links = page.get_hyperlinks()
    assert len(links) == 1
    assert links[0]["href"].url == "http://example.com/page1"
    assert links[0]["words"] == ["link", "text"]
def _test_mozilla_readability_testcases(test_name):
    """Our word groups for a page must match those of its readability-simplified version.

    Reads source.html and expected.html from the Mozilla readability testcase
    directory for *test_name* and compares the extracted word groups.
    """
    case_dir = "%s/%s" % (MOZILLA_READABILITY_DIR, test_name)

    # Combined context manager instead of the nested `with` blocks.
    with open(case_dir + "/source.html") as fhtml, open(case_dir + "/expected.html") as fsimple:
        html = fhtml.read()
        simple = fsimple.read()

    doc = HTMLDocument(html).parse()
    doc_simple = HTMLDocument(simple).parse()

    # Keep only the text part (index 0) of each analyzed word group.
    doc_word_groups = [wg[0] for wg in doc.analysis["word_groups"]]
    doc_simple_word_groups = [wg[0] for wg in doc_simple.analysis["word_groups"]]

    assert doc_word_groups == doc_simple_word_groups
def test_get_word_groups(sample):
    """Each extracted word group matches the sample's expected (tag, words) pairs."""
    page = HTMLDocument(sample["html"]).parse()
    word_groups = page.get_word_groups()

    for idx, group in enumerate(word_groups):
        expected = sample["groups"][idx]
        assert group["tag"] == expected[0]
        assert group["words"] == expected[1]

    # No missing or extra groups
    assert len(word_groups) == len(sample["groups"])
def test_reparse():
    """A non-utf8 document is re-decoded before parsing, since gumbo only accepts utf-8."""
    from cosrlib.document.html import HTMLDocument

    document = HTMLDocument(
        """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    )
    assert document.encoding.detect().name == "iso8859-15"

    # A re-parsing of the document should be triggered, gumbo only accepts utf-8
    document.parse()
    # The title comes back as utf-8 bytes (0xC3 0xA9 == é)
    assert document.get_title() == "Mac\xc3\xa9o"
def test_format_summary():
    """format_summary() prefers meta description, then og:description, then body text."""
    # Explicit <meta name="Description">
    html = """<html> <head><meta name="Description" content=" This is a <summary>!" /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "This is a <summary>!"

    # Open Graph description
    html = """<html> <head><meta property="og:description" content=" This is a <summary>!" /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "This is a <summary>!"

    # Empty meta description: fall back to body text
    html = """<html> <head><meta name="Description" content="" /></head> <body> <div>This is <body> text, very detailed, very long xxxxxxxxx! </div></body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "This is <body> text, very detailed, very long xxxxxxxxx!"

    # A heading is preferred over plain text when present
    html = """<html> <head><meta name="Description" content="" /></head> <body> <div>This is <body> text, very detailed, very long xxxxxxxxx! </div> <h1>But there is a more informative title! Use it</h1> </body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "But there is a more informative title! Use it"
def _test_newspaper_testcases(test_name):
    """Our extracted word groups must match newspaper's expected text output.

    Reads the HTML fixture and its expected .txt counterpart from the
    newspaper testcase directory and compares group-by-group.
    """
    html_path = "%s/html/%s" % (NEWSPAPER_DIR, test_name)
    txt_path = "%s/text/%s" % (NEWSPAPER_DIR, test_name.replace(".html", ".txt"))

    # Combined context manager instead of the nested `with` blocks.
    with open(html_path) as fhtml, open(txt_path) as ftxt:
        html = fhtml.read()
        txt = ftxt.read()

    # The expected file separates word groups with blank lines.
    word_groups = txt.split("\n\n")

    doc = HTMLDocument(html).parse()
    # Keep only the text part (index 0) of each analyzed word group.
    doc_word_groups = [wg[0] for wg in doc.analysis["word_groups"]]

    assert doc_word_groups == word_groups
def test_hidden_text():
    """Words inside scripts, styles, comments and hidden elements are excluded."""
    html = """<html><head></head><body> <script> hello(); </script> <style> style { good } </style> <!-- comment --> text <p>p</p> <div style='display: none;'>hidden by display</div> <div hidden>hidden by html5 attribute</div> <div aria-hidden="true">hidden by aria</div> <div aria-hidden="false">not_aria</div> <div style='visibility: hidden;'>hidden by visibility</div> </body></html>"""
    page = HTMLDocument(html).parse()
    # Only visible text survives
    assert page.get_all_words() == set(["text", "p", "not_aria"])
def test_get_hyperlinks():
    """Internal vs external hyperlink classification and URL resolution."""
    html = """<html><head></head><body> before <a href="http://example.com/page1">link text</a> after <a href="/page2">relative2</a> <a href="page3?q=1#d">relative3</a> <a href="http://other.example.com/page4">absolute4</a> <a href="//other.example.com/page5?q=1#d">absolute5</a> <a href="https://other.example.com/page6?q=1#d">absolute6</a> <a href="javascript:func()">js1</a> </body></html>"""
    page = HTMLDocument(html, url="http://example.com/page.html").parse()

    # External links: other domains only, resolved to absolute URL objects
    links = page.get_external_hyperlinks()
    assert len(links) == 3
    expected_external = [
        ("http://other.example.com/page4", "absolute4"),
        ("http://other.example.com/page5?q=1#d", "absolute5"),
        ("https://other.example.com/page6?q=1#d", "absolute6"),
    ]
    for link, (url, text) in zip(links, expected_external):
        assert link["href"].url == url
        assert link["text"] == text

    # This doesn't return URLs, it returns strings (they are paths)
    links = page.get_internal_hyperlinks()
    assert len(links) == 3
    expected_internal = [
        ("/page1", "link text"),
        ("/page2", "relative2"),
        ("page3?q=1#d", "relative3"),
    ]
    for link, (path, text) in zip(links, expected_internal):
        assert link["path"] == path
        assert link["text"] == text

    # All links in absolute
    links = page.get_hyperlinks()
    assert len(links) == 6
    assert links[2]["href"].url == "http://example.com/page3?q=1#d"
def test_format_title_unicode():
    """UTF-8 variant of the title formatting test: also checks emoji/symbol stripping.

    NOTE(review): renamed from test_format_title — a second test with that
    exact name is defined later in this file, which shadowed this one so
    pytest never collected it.
    """

    def format_html_title(title, url=None):
        # Wrap *title* in a minimal UTF-8 page and run the title formatter on it.
        doc = HTMLDocument(""" <html><head><meta charset="UTF-8"><title>%s</title></head><body>Hello</body></html> """ % title, url=url)
        doc.parse()
        return format_title(doc, {})

    assert format_html_title("A Title!") == "A Title!"
    assert format_html_title(" A \n Title\t \t! ") == "A Title !"
    assert format_html_title("a" * 100) == ("a" * 70) + "..."

    # Test that emoji characters and symbols are removed from titles
    # (dropped a stray no-op empty-string literal that was concatenated here)
    emoji_title = u"😋 Super Emoji-Land.com "
    emoji_title = emoji_title.encode('utf8')
    emoji_title = format_html_title(emoji_title)
    assert emoji_title == "Super Emoji-Land.com"

    # Truncation happens at a word boundary when possible
    assert format_html_title(("a" * 60) + " 2345678 1234567") == ("a" * 60) + " 2345678..."
    assert format_html_title(("a" * 60) + " 234567890 1234567") == ("a" * 60) + " 234567890..."
    assert format_html_title(("a" * 60) + " 2345678901 1234567") == ("a" * 60) + "..."

    # Test domain fallback
    assert format_html_title(" ", url="http://www.example.com/hello.html") == "Example"

    # Test blacklist
    assert format_html_title(" home ", url="http://www.example.com/hello.html") == "Example"

    # Test OGP
    html = """<html> <head><meta property="og:title" content="Open graph title " /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_title(page, {}) == "Open graph title"
def test_format_title():
    """format_title() trims whitespace, truncates long titles, and falls back to the domain/OGP."""

    def format_html_title(title, url=None):
        # Wrap *title* in a minimal page and run the title formatter on it.
        doc = HTMLDocument(""" <html><head><title>%s</title></head><body>Hello</body></html> """ % title, url=url)
        doc.parse()
        return format_title(doc, {})

    assert format_html_title("A Title!") == "A Title!"
    assert format_html_title(" A \n Title\t \t! ") == "A Title !"
    assert format_html_title("a" * 100) == ("a" * 70) + "..."

    # Truncation happens at a word boundary when possible
    assert format_html_title(("a" * 60) + " 2345678 1234567") == ("a" * 60) + " 2345678..."
    assert format_html_title(("a" * 60) + " 234567890 1234567") == ("a" * 60) + " 234567890..."
    assert format_html_title(("a" * 60) + " 2345678901 1234567") == ("a" * 60) + "..."

    # Test domain fallback
    assert format_html_title(" ", url="http://www.example.com/hello.html") == "Example"

    # Test blacklist
    assert format_html_title(" home ", url="http://www.example.com/hello.html") == "Example"

    # Test OGP
    html = """<html> <head><meta property="og:title" content="Open graph title " /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_title(page, {}) == "Open graph title"
def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Run one W3C input-byte-stream encoding testcase against the detector."""
    from cosrlib.document.html import HTMLDocument

    test_file = os.path.join("tests/testdata/html_w3c_encoding_testcases",
                             "the-input-byte-stream-%s.html" % p_w3c_test)

    with open(test_file, "rb") as f:
        headers = {}
        # Some testcases come with an HTTP header in a sidecar ".headers" file;
        # the slice presumably drops a leading "Content-Type: " prefix (14 bytes).
        if os.path.isfile(test_file + ".headers"):
            with open(test_file + ".headers", "rb") as hf:
                headers["content-type"] = hf.read()[14:].strip()

        html = f.read()
        # print repr(html[0:10])
        doc = HTMLDocument(html, url=None, headers=headers)

        if p_expected_encoding is None:
            assert doc.encoding.detect() is None
        else:
            assert doc.encoding.detect().name == p_expected_encoding

        doc.parse()
def _links(html, url=None):
    """Parse *html* (optionally resolved against *url*) and return all its hyperlinks."""
    document = HTMLDocument(html, url=url)
    return document.parse().get_hyperlinks()
def _make_document(url, text, title=""):
    """Wrap *text* and *title* in a minimal HTML page and return the parsed HTMLDocument."""
    from cosrlib.document.html import HTMLDocument

    html = "<html><head><title>%s</title></head><body>%s</body></html>" % (title, text)
    return HTMLDocument(html, url=url).parse()