Python HTMLDocument.parse Examples

Programming Language: Python

Namespace/Package Name: cosrlib.document.html

Class/Type: HTMLDocument

Method/Function: parse

Examples at hotexamples.com: 12

Python HTMLDocument.parse - 12 examples found. These are the top-rated real-world Python examples of cosrlib.document.html.HTMLDocument.parse, extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

HTMLDocument(23)

parse(6)

get_all_words(2)

get_hyperlinks(2)

get_title(2)

get_domain_paid_words(1)

get_external_hyperlinks(1)

get_internal_hyperlinks(1)

get_summary(1)

get_url(1)

get_url_words(1)

get_word_groups(1)

parse_canonical_url(1)

Example #1

Show file

File: test_encoding.py Project: JBaba/cosr-back

def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Check encoding detection against one W3C "input byte stream" fixture.

    The fixture HTML is read as bytes, together with an optional ``.headers``
    sidecar file that supplies the ``content-type`` header. The detected
    encoding must be ``None`` when ``p_expected_encoding`` is ``None``,
    otherwise its ``name`` must equal ``p_expected_encoding``. ``parse()`` is
    then called on the document.
    """
    from cosrlib.document.html import HTMLDocument

    fixture_name = "the-input-byte-stream-%s.html" % p_w3c_test
    test_file = os.path.join(
        "tests/testdata/html_w3c_encoding_testcases", fixture_name
    )
    headers_file = test_file + ".headers"

    with open(test_file, "rb") as html_fh:
        headers = {}
        if os.path.isfile(headers_file):
            with open(headers_file, "rb") as headers_fh:
                raw_header = headers_fh.read()
            # Skip the first 14 bytes -- presumably a "Content-Type: " prefix;
            # verify against the fixture files.
            headers["content-type"] = raw_header[14:].strip()

        html = html_fh.read()
        doc = HTMLDocument(html, url=None, headers=headers)

        detected = doc.encoding.detect()
        if p_expected_encoding is not None:
            assert detected.name == p_expected_encoding
        else:
            assert detected is None

        doc.parse()

Example #2

Show file

File: test_formatting.py Project: mlinksva/cosr-back

 def format_html_title(title, url=None):
     """Return format_title() for a minimal HTML page whose <title> is `title`."""
     page_template = """
         <html><head><title>%s</title></head><body>Hello</body></html>
     """
     document = HTMLDocument(page_template % title, url=url)
     document.parse()
     return format_title(document, {})

Example #3

Show file

File: test_encoding.py Project: JBaba/cosr-back

def test_encoding_xml():
    """The encoding named in the XML declaration is the one detected."""
    from cosrlib.document.html import HTMLDocument

    markup = """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html>
<html  lang="en" ></html>
    """
    document = HTMLDocument(markup)
    detected = document.encoding.detect()
    assert detected.name == "shift_jis"
    document.parse()

Example #4

Show file

File: test_encoding.py Project: x0rzkov/cosr-back

def test_encoding_x_user_defined():
    """A <meta charset="x-user-defined"> page is detected as x-user-defined."""
    from cosrlib.document.html import HTMLDocument

    markup = """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    document = HTMLDocument(markup)
    detected = document.encoding.detect()
    assert detected.name == "x-user-defined"
    document.parse()

Example #5

Show file

File: test_encoding.py Project: x0rzkov/cosr-back

def test_encoding_xml():
    """Detection reports the encoding declared in the XML prolog."""
    from cosrlib.document.html import HTMLDocument

    xml_prolog_page = """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html>
<html  lang="en" ></html>
    """
    parsed = HTMLDocument(xml_prolog_page)
    encoding = parsed.encoding.detect()
    assert encoding.name == "shift_jis"
    parsed.parse()

Example #6

Show file

File: test_encoding.py Project: JBaba/cosr-back

def test_reparse():
    """An iso-8859-15 page yields a UTF-8 encoded title after parse()."""
    from cosrlib.document.html import HTMLDocument

    latin9_page = """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    document = HTMLDocument(latin9_page)
    detected = document.encoding.detect()
    assert detected.name == "iso8859-15"

    # gumbo only accepts utf-8, so parse() should trigger a re-parse here
    document.parse()

    title = document.get_title()
    assert title == "Mac\xc3\xa9o"

Example #7

Show file

File: test_encoding.py Project: JBaba/cosr-back

def test_encoding_aliases():
    """Both the tis-620 and windows-874 charset labels are detected as cp874."""
    from cosrlib.document.html import HTMLDocument

    pages = (
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>""",
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>""",
    )
    for page in pages:
        document = HTMLDocument(page)
        assert document.encoding.detect().name == "cp874"
        document.parse()

Example #8

Show file

File: test_encoding.py Project: x0rzkov/cosr-back

def test_reparse():
    """After parse(), the title of an iso-8859-15 page is returned as UTF-8."""
    from cosrlib.document.html import HTMLDocument

    source = """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    page = HTMLDocument(source)
    assert page.encoding.detect().name == "iso8859-15"

    # parse() should re-parse the document, since gumbo only accepts utf-8
    page.parse()

    expected_title = "Mac\xc3\xa9o"
    assert page.get_title() == expected_title

Example #9

Show file

File: test_encoding.py Project: x0rzkov/cosr-back

def test_encoding_aliases():
    """The tis-620 and windows-874 charset labels both resolve to cp874."""
    from cosrlib.document.html import HTMLDocument

    aliased_pages = [
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>""",
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>""",
    ]
    for markup in aliased_pages:
        page = HTMLDocument(markup)
        detected = page.encoding.detect()
        assert detected.name == "cp874"
        page.parse()

Example #10

Show file

File: test_encoding.py Project: x0rzkov/cosr-back

def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Run one W3C "input byte stream" fixture through encoding detection.

    Reads the fixture HTML as bytes and, when a ``.headers`` sidecar file
    exists, uses its contents as the ``content-type`` header. Asserts that
    the detected encoding is ``None`` when ``p_expected_encoding`` is
    ``None``, or that its ``name`` equals ``p_expected_encoding`` otherwise,
    and finally calls ``parse()``.
    """
    from cosrlib.document.html import HTMLDocument

    fixtures_dir = "tests/testdata/html_w3c_encoding_testcases"
    test_file = os.path.join(
        fixtures_dir, "the-input-byte-stream-%s.html" % p_w3c_test
    )
    sidecar = test_file + ".headers"

    with open(test_file, "rb") as page_file:
        headers = {}
        if os.path.isfile(sidecar):
            with open(sidecar, "rb") as sidecar_file:
                # Drop the first 14 bytes -- presumably the "Content-Type: "
                # prefix; confirm against the fixture files.
                content_type = sidecar_file.read()[14:]
                headers["content-type"] = content_type.strip()

        html = page_file.read()
        doc = HTMLDocument(html, url=None, headers=headers)

        encoding = doc.encoding.detect()
        if p_expected_encoding is None:
            assert encoding is None
        else:
            assert encoding.name == p_expected_encoding

        doc.parse()

Example #11

Show file

File: test_encoding.py Project: JBaba/cosr-back

def test_encoding_x_user_defined():
    """The x-user-defined charset label is detected under that same name."""
    from cosrlib.document.html import HTMLDocument

    page_source = """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    page = HTMLDocument(page_source)
    assert page.encoding.detect().name == "x-user-defined"
    page.parse()

Example #12

Show file

File: test_formatting.py Project: bakztfuture/cosr-back

 def format_html_title(title, url=None):
     """Return format_title() for a small UTF-8 HTML page titled `title`."""
     page_template = """
         <html><head><meta charset="UTF-8"><title>%s</title></head><body>Hello</body></html>
     """
     document = HTMLDocument(page_template % title, url=url)
     document.parse()
     return format_title(document, {})