def test_get_title():
    """get_title() returns the <title> of the document head, ignoring later <title> tags."""
    samples = [
        """<html><head><title>Test title</title></head><body>x</body></html>""",
        """<html><title>Test title</title>XX</html>""",
        """<html><head><title>Test title</title></head><body><title>x</title></body></html>""",
    ]
    for html in samples:
        assert HTMLDocument(html).parse().get_title() == "Test title"
def test_get_url_words():
    """get_url_words() tokenizes the URL into lowercase words (no scheme, query or fragment)."""
    nyt_url = "http://www.nytimes.com/2011/10/06/arts/music/maceo-parker.html?print=true#hash"
    nyt_words = [
        "nytimes", "com", "2011", "10", "06",
        "arts", "music", "maceo", "parker", "html",
    ]
    assert HTMLDocument("", url=nyt_url).parse().get_url_words() == nyt_words

    wiki_url = "https://en.wikipedia.org/wiki/Nine_Inch_Nails"
    wiki_words = ["en", "wikipedia", "org", "wiki", "nine", "inch", "nails"]
    assert HTMLDocument("", url=wiki_url).parse().get_url_words() == wiki_words
def test_get_canonical_url():
    """parse_canonical_url() honors <link rel=canonical> case-insensitively,
    resolves relative hrefs against the page URL, and rejects cross-domain targets."""

    def canonical(html, url):
        # Parse the page and return its canonical URL object (or None).
        return HTMLDocument(html, url=url).parse().parse_canonical_url()

    base = "http://example.com/page.html"

    # No <link rel=canonical> at all
    assert canonical("""<html><head></head><body>x</body></html>""", base) is None

    # An empty href is ignored
    assert canonical(
        """<html><head><link rel="canonical" href="" /></head><body>x</body></html>""",
        base) is None

    # Plain absolute canonical URL
    assert canonical(
        """<html><head><link rel="canonical" href="http://example.com/page2.html" /></head><body>x</body></html>""",
        base).url == "http://example.com/page2.html"

    # Tag & attribute names are matched case-insensitively
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="http://example.com/page2.html" /></head><body>x</body></html>""",
        base).url == "http://example.com/page2.html"

    # Cross domain blocked for now
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="http://example2.com/page2.html" /></head><body>x</body></html>""",
        base) is None

    # Relative URLs
    dir_base = "http://example.com/dir/page.html"
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="/dir2/page2.html" /></head><body>x</body></html>""",
        dir_base).url == "http://example.com/dir2/page2.html"
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="dir2/page2.html" /></head><body>x</body></html>""",
        dir_base).url == "http://example.com/dir/dir2/page2.html"
    assert canonical(
        """<html><head><linK reL="caNonical" hreF="//example.com/dir2/page2.html" /></head><body>x</body></html>""",
        dir_base).url == "http://example.com/dir2/page2.html"
def test_encoding_aliases():
    """Both the "tis-620" and "windows-874" charset declarations resolve to the cp874 codec."""
    from cosrlib.document.html import HTMLDocument

    tis = HTMLDocument(
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>"""
    )
    assert tis.encoding.detect().name == "cp874"
    tis.parse()

    win = HTMLDocument(
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>"""
    )
    assert win.encoding.detect().name == "cp874"
    win.parse()
def format_html_title(title, url=None):
    """Render *title* inside a minimal HTML page, parse it, and run format_title on it."""
    html = """ <html><head><title>%s</title></head><body>Hello</body></html> """ % title
    doc = HTMLDocument(html, url=url)
    doc.parse()
    return format_title(doc, {})
def test_parsing_samples(sample_name):
    """Parse one sample HTML file and check its expected title/summary/word metadata.

    The expectations come from the SAMPLES registry keyed by file name.
    """
    metadata = SAMPLES[sample_name]
    sample_file = "tests/testdata/html_page_samples/%s" % sample_name

    with open(sample_file, "r") as f:
        html = f.read()

    page = HTMLDocument(html).parse()

    if "title" in metadata:
        assert metadata["title"] == page.get_title()

    if "summary" in metadata:
        assert metadata["summary"] == page.get_summary()

    # for k, g in sorted(page.get_word_groups().items()):
    #     print k, g

    words = page.get_all_words()
    lower_words_set = set([w.lower() for w in words])

    # Uncomment this to debug
    if metadata.get("debug"):
        # Parenthesized so the statement is valid under both Python 2 and 3
        # (the original bare `print words` statement is Python-2-only).
        print(words)

    for word in metadata.get("assert_words_missing", []):
        assert word not in lower_words_set

    for word in metadata.get("assert_words", []):
        assert word in lower_words_set
def test_get_url():
    """get_url() falls back to the fetch URL unless a canonical link tag overrides it."""
    cases = [
        # When none is given, we take the URL
        ("""<html><head></head><body>x</body></html>""",
         "http://example.com/page.html"),
        # But when a tag is present, it has precedence
        ("""<html><head><link rel="canonical" href="http://example.com/page2.html" /></head><body>x</body></html>""",
         "http://example.com/page2.html"),
        # Including with strange caps
        ("""<htmL><heaD><linK reL="CANonical" hreF="http://example.com/Page2.html" /></head><body>x</body></html>""",
         "http://example.com/Page2.html"),
    ]
    for html, expected in cases:
        page = HTMLDocument(html, url="http://example.com/page.html").parse()
        assert page.get_url().url == expected
def test_get_domain_paid_words():
    """get_domain_paid_words() keeps only the paid-level domain word (drops www + public suffix)."""
    url = "http://www.bbc.co.uk/2011/10/06/arts/music/maceo-parker.html?print=true"
    doc = HTMLDocument("", url=url)
    assert doc.get_domain_paid_words() == ["bbc"]
def test_encoding_x_user_defined():
    """The non-standard "x-user-defined" charset is detected under its own name."""
    from cosrlib.document.html import HTMLDocument

    document = HTMLDocument(
        """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    )
    assert document.encoding.detect().name == "x-user-defined"
    document.parse()
def test_encoding_xml():
    """An encoding declared in an XML prolog is picked up by the detector."""
    from cosrlib.document.html import HTMLDocument

    document = HTMLDocument(
        """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html> <html lang="en" ></html> """
    )
    assert document.encoding.detect().name == "shift_jis"
    document.parse()
def test_get_hyperlinks_single():
    """A single absolute link: href is resolved to a URL object and its text tokenized.

    NOTE(review): renamed from test_get_hyperlinks — another test with that
    exact name is defined later in this file, which shadowed this one so
    pytest never collected it.
    """
    html = """<html><head></head><body>before <a href="http://example.com/page1">link text</a> after</body></html>"""
    page = HTMLDocument(html, url="http://example.com/page.html").parse()

    links = page.get_hyperlinks()
    assert len(links) == 1
    assert links[0]["href"].url == "http://example.com/page1"
    assert links[0]["words"] == ["link", "text"]
def _test_mozilla_readability_testcases(test_name):
    """Our word groups for a page must match those of its readability-simplified version.

    Reads source.html and expected.html from the Mozilla readability testcase
    directory for *test_name* and compares the extracted word groups.
    """
    case_dir = "%s/%s" % (MOZILLA_READABILITY_DIR, test_name)

    # Combined context manager instead of the nested `with` blocks.
    with open(case_dir + "/source.html") as fhtml, open(case_dir + "/expected.html") as fsimple:
        html = fhtml.read()
        simple = fsimple.read()

    doc = HTMLDocument(html).parse()
    doc_simple = HTMLDocument(simple).parse()

    # Keep only the text part (index 0) of each analyzed word group.
    doc_word_groups = [wg[0] for wg in doc.analysis["word_groups"]]
    doc_simple_word_groups = [wg[0] for wg in doc_simple.analysis["word_groups"]]

    assert doc_word_groups == doc_simple_word_groups
def test_get_word_groups(sample):
    """Each extracted word group matches the sample's expected (tag, words) pairs."""
    page = HTMLDocument(sample["html"]).parse()
    word_groups = page.get_word_groups()

    for idx, group in enumerate(word_groups):
        expected = sample["groups"][idx]
        assert group["tag"] == expected[0]
        assert group["words"] == expected[1]

    # No missing or extra groups
    assert len(word_groups) == len(sample["groups"])
def test_reparse():
    """A non-utf8 document is re-decoded before parsing, since gumbo only accepts utf-8."""
    from cosrlib.document.html import HTMLDocument

    document = HTMLDocument(
        """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    )
    assert document.encoding.detect().name == "iso8859-15"

    # A re-parsing of the document should be triggered, gumbo only accepts utf-8
    document.parse()
    # The title comes back as utf-8 bytes (0xC3 0xA9 == é)
    assert document.get_title() == "Mac\xc3\xa9o"
def test_format_summary():
    """format_summary() prefers meta description, then og:description, then body text."""
    # Explicit <meta name="Description">
    html = """<html> <head><meta name="Description" content=" This is a <summary>!" /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "This is a <summary>!"

    # Open Graph description
    html = """<html> <head><meta property="og:description" content=" This is a <summary>!" /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "This is a <summary>!"

    # Empty meta description: fall back to body text
    html = """<html> <head><meta name="Description" content="" /></head> <body> <div>This is <body> text, very detailed, very long xxxxxxxxx! </div></body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "This is <body> text, very detailed, very long xxxxxxxxx!"

    # A heading is preferred over plain text when present
    html = """<html> <head><meta name="Description" content="" /></head> <body> <div>This is <body> text, very detailed, very long xxxxxxxxx! </div> <h1>But there is a more informative title! Use it</h1> </body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_summary(page, {}) == "But there is a more informative title! Use it"
def _test_newspaper_testcases(test_name):
    """Our extracted word groups must match newspaper's expected text output.

    Reads the HTML fixture and its expected .txt counterpart from the
    newspaper testcase directory and compares group-by-group.
    """
    html_path = "%s/html/%s" % (NEWSPAPER_DIR, test_name)
    txt_path = "%s/text/%s" % (NEWSPAPER_DIR, test_name.replace(".html", ".txt"))

    # Combined context manager instead of the nested `with` blocks.
    with open(html_path) as fhtml, open(txt_path) as ftxt:
        html = fhtml.read()
        txt = ftxt.read()

    # The expected file separates word groups with blank lines.
    word_groups = txt.split("\n\n")

    doc = HTMLDocument(html).parse()
    # Keep only the text part (index 0) of each analyzed word group.
    doc_word_groups = [wg[0] for wg in doc.analysis["word_groups"]]

    assert doc_word_groups == word_groups
def test_hidden_text():
    """Words inside scripts, styles, comments and hidden elements are excluded."""
    html = """<html><head></head><body> <script> hello(); </script> <style> style { good } </style> <!-- comment --> text <p>p</p> <div style='display: none;'>hidden by display</div> <div hidden>hidden by html5 attribute</div> <div aria-hidden="true">hidden by aria</div> <div aria-hidden="false">not_aria</div> <div style='visibility: hidden;'>hidden by visibility</div> </body></html>"""
    page = HTMLDocument(html).parse()
    # Only visible text survives
    assert page.get_all_words() == set(["text", "p", "not_aria"])
def test_get_hyperlinks():
    """Internal vs external hyperlink classification and URL resolution."""
    html = """<html><head></head><body> before <a href="http://example.com/page1">link text</a> after <a href="/page2">relative2</a> <a href="page3?q=1#d">relative3</a> <a href="http://other.example.com/page4">absolute4</a> <a href="//other.example.com/page5?q=1#d">absolute5</a> <a href="https://other.example.com/page6?q=1#d">absolute6</a> <a href="javascript:func()">js1</a> </body></html>"""
    page = HTMLDocument(html, url="http://example.com/page.html").parse()

    # External links: other domains only, resolved to absolute URL objects
    links = page.get_external_hyperlinks()
    assert len(links) == 3
    expected_external = [
        ("http://other.example.com/page4", "absolute4"),
        ("http://other.example.com/page5?q=1#d", "absolute5"),
        ("https://other.example.com/page6?q=1#d", "absolute6"),
    ]
    for link, (url, text) in zip(links, expected_external):
        assert link["href"].url == url
        assert link["text"] == text

    # This doesn't return URLs, it returns strings (they are paths)
    links = page.get_internal_hyperlinks()
    assert len(links) == 3
    expected_internal = [
        ("/page1", "link text"),
        ("/page2", "relative2"),
        ("page3?q=1#d", "relative3"),
    ]
    for link, (path, text) in zip(links, expected_internal):
        assert link["path"] == path
        assert link["text"] == text

    # All links in absolute
    links = page.get_hyperlinks()
    assert len(links) == 6
    assert links[2]["href"].url == "http://example.com/page3?q=1#d"
def test_format_title_unicode():
    """UTF-8 variant of the title formatting test: also checks emoji/symbol stripping.

    NOTE(review): renamed from test_format_title — a second test with that
    exact name is defined later in this file, which shadowed this one so
    pytest never collected it.
    """

    def format_html_title(title, url=None):
        # Wrap *title* in a minimal UTF-8 page and run the title formatter on it.
        doc = HTMLDocument(""" <html><head><meta charset="UTF-8"><title>%s</title></head><body>Hello</body></html> """ % title, url=url)
        doc.parse()
        return format_title(doc, {})

    assert format_html_title("A Title!") == "A Title!"
    assert format_html_title(" A \n Title\t \t! ") == "A Title !"
    assert format_html_title("a" * 100) == ("a" * 70) + "..."

    # Test that emoji characters and symbols are removed from titles
    # (dropped a stray no-op empty-string literal that was concatenated here)
    emoji_title = u"😋 Super Emoji-Land.com "
    emoji_title = emoji_title.encode('utf8')
    emoji_title = format_html_title(emoji_title)
    assert emoji_title == "Super Emoji-Land.com"

    # Truncation happens at a word boundary when possible
    assert format_html_title(("a" * 60) + " 2345678 1234567") == ("a" * 60) + " 2345678..."
    assert format_html_title(("a" * 60) + " 234567890 1234567") == ("a" * 60) + " 234567890..."
    assert format_html_title(("a" * 60) + " 2345678901 1234567") == ("a" * 60) + "..."

    # Test domain fallback
    assert format_html_title(" ", url="http://www.example.com/hello.html") == "Example"

    # Test blacklist
    assert format_html_title(" home ", url="http://www.example.com/hello.html") == "Example"

    # Test OGP
    html = """<html> <head><meta property="og:title" content="Open graph title " /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_title(page, {}) == "Open graph title"
def test_format_title():
    """format_title() trims whitespace, truncates long titles, and falls back to the domain/OGP."""

    def format_html_title(title, url=None):
        # Wrap *title* in a minimal page and run the title formatter on it.
        doc = HTMLDocument(""" <html><head><title>%s</title></head><body>Hello</body></html> """ % title, url=url)
        doc.parse()
        return format_title(doc, {})

    assert format_html_title("A Title!") == "A Title!"
    assert format_html_title(" A \n Title\t \t! ") == "A Title !"
    assert format_html_title("a" * 100) == ("a" * 70) + "..."

    # Truncation happens at a word boundary when possible
    assert format_html_title(("a" * 60) + " 2345678 1234567") == ("a" * 60) + " 2345678..."
    assert format_html_title(("a" * 60) + " 234567890 1234567") == ("a" * 60) + " 234567890..."
    assert format_html_title(("a" * 60) + " 2345678901 1234567") == ("a" * 60) + "..."

    # Test domain fallback
    assert format_html_title(" ", url="http://www.example.com/hello.html") == "Example"

    # Test blacklist
    assert format_html_title(" home ", url="http://www.example.com/hello.html") == "Example"

    # Test OGP
    html = """<html> <head><meta property="og:title" content="Open graph title " /></head> <body>This is <body> text</body> </html>"""
    page = HTMLDocument(html).parse()
    assert format_title(page, {}) == "Open graph title"
def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Run one W3C input-byte-stream encoding testcase against the detector."""
    from cosrlib.document.html import HTMLDocument

    test_file = os.path.join("tests/testdata/html_w3c_encoding_testcases",
                             "the-input-byte-stream-%s.html" % p_w3c_test)

    with open(test_file, "rb") as f:
        headers = {}
        # Some testcases come with an HTTP header in a sidecar ".headers" file;
        # the slice presumably drops a leading "Content-Type: " prefix (14 bytes).
        if os.path.isfile(test_file + ".headers"):
            with open(test_file + ".headers", "rb") as hf:
                headers["content-type"] = hf.read()[14:].strip()

        html = f.read()
        # print repr(html[0:10])
        doc = HTMLDocument(html, url=None, headers=headers)

        if p_expected_encoding is None:
            assert doc.encoding.detect() is None
        else:
            assert doc.encoding.detect().name == p_expected_encoding

        doc.parse()
def _links(html, url=None):
    """Parse *html* (optionally resolved against *url*) and return all its hyperlinks."""
    document = HTMLDocument(html, url=url)
    return document.parse().get_hyperlinks()
def _make_document(url, text, title=""):
    """Wrap *text* and *title* in a minimal HTML page and return the parsed HTMLDocument."""
    from cosrlib.document.html import HTMLDocument

    html = "<html><head><title>%s</title></head><body>%s</body></html>" % (title, text)
    return HTMLDocument(html, url=url).parse()