コード例 #1
0
def convert_file(fname):
    """Convert an .mht archive into a plain-text .txt file.

    Pulls the first ``text/html`` part out of the MHT (MIME) message,
    decodes its quoted-printable payload, and writes one output line
    per child of the root div, tab-indenting paragraphs that start
    with whitespace or that ``is_indented`` flags.
    """
    out_fname = fname.replace('.mht', '.txt')
    with open(fname, "rb") as fh:
        message = email.message_from_binary_file(fh)

    html_text = ""
    for part in message.walk():
        if part.get_content_type() == "text/html":
            html_text = part.get_payload()
            break

    if not html_text:
        print("Could not find HTML text in {}".format(fname))
        sys.exit(2)

    decoded_bytes = quopri.decodestring(html_text.encode('utf-8'))
    doc = html5parser.document_fromstring(decoded_bytes.decode('utf-8'))
    # Document layout: doc[1] is <body>, whose last child is the root div.
    rootdiv = doc[1][-1]

    with open(out_fname, "w") as out_f:
        for paragraph in rootdiv:
            text = text_of(paragraph)
            if re.match(r'\s', text) or is_indented(paragraph):
                out_f.write("\t{}\n".format(text.strip()))
            else:
                out_f.write("{}\n".format(text.strip()))
コード例 #2
0
ファイル: utils.py プロジェクト: containerz/talon
def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
        parsed = html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        return None
    return parsed
コード例 #3
0
def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    # html5lib wants bytes, so encode unicode input up front.
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        return None
コード例 #4
0
ファイル: utils.py プロジェクト: guruhq/talon
def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        # Refuse oversized documents instead of parsing them.
        if html_too_big(s):
            return None
        parsed = html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        return None
    return parsed
コード例 #5
0
 def process_response(self, request, response):
     """
     Analyze the response's appcache manifest data, but only when the
     ``appcache_analyze`` querystring parameter is present, to avoid
     overhead during normal navigation.
     """
     content_type = response['Content-Type']
     if (content_type.find("text/html") > -1 and
             request.GET.get("appcache_analyze", False)):
         lxdoc = document_fromstring(response.content)
         self.walk_tree(lxdoc)
         # Expose the collected manifest sections on the response.
         response.appcache = {'cached': self._cached,
                              'fallback': self._fallback,
                              'network': self._network}
     return response
コード例 #6
0
ファイル: find.py プロジェクト: tumregels/ankle
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match given skeleton.

    See `find_all` for details.
    """
    # Accept raw strings for either argument and parse them lazily
    # (this is a generator, so nothing runs until iteration starts).
    if is_string(document):
        document = html5parser.document_fromstring(document)
    if is_string(skeleton):
        skeleton = html5parser.fragment_fromstring(skeleton)

    for candidate in document.iterdescendants():
        if not node_matches_bone(candidate, skeleton):
            continue
        yield candidate
コード例 #7
0
ファイル: test_views.py プロジェクト: kcunning/balt
    def test_index_shows_all_tutorials(self):
        '''The Tutorial index should show all tutorials'''

        # Unique titles so the assertion can't pass by accident.
        expected = ["A-{}".format(uuid.uuid4()),
                    "B-{}".format(uuid.uuid4())]
        for title in expected:
            models.Tutorial.objects.create(title=title, fpid=title)

        response = self.client.get(reverse('index'))
        # html5parser repairs the markup; re-parse so cssselect works.
        repaired = html5parser.document_fromstring(response.content)
        nodes = document_fromstring(etree.tostring(repaired))

        found = [node.text for node in nodes.cssselect('.t-tutorial-title')]
        self.assertEqual(expected, found)
コード例 #8
0
def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    # html5lib doesn't allow us to pass encodings to the parser because
    # reasons, so it defaults to Windows-1252.  The decoding afterwards is
    # done in UTF-8, and while this works fine for english alphabet
    # characters, other characters are mis-decoded.  We coerce the string
    # to unicode so that it goes through the unicode flow and all is well.
    # (Python 2 code: `unicode` is the builtin text type.)
    s = unicode(s)
    try:
        # Refuse oversized documents instead of parsing them.
        if html_too_big(s):
            return None

        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
コード例 #9
0
ファイル: html_utils.py プロジェクト: janderse/juriscraper
def get_html5_parsed_text(text):
    """Return content using the html5parser, ideal for faulty html.

    The html5parser first builds an _Element object; that tree is
    serialized with `tostring` and re-parsed with the ordinary
    `fromstring`.  The html5parser repairs irregularities in the
    markup, while the usual lxml parser gives us the familiar API.

    :param text: The html of the document
    :return: an lxml.HtmlElement object
    """
    html5_tree = html5parser.document_fromstring(text.encode('utf-8'))
    serialized = tostring(html5_tree, encoding='unicode')
    return fromstring(serialized)
コード例 #10
0
 def process_response(self, request, response):
     """
     Collect appcache manifest data from an HTML response.  Runs only
     when the ``appcache_analyze`` querystring parameter opts in, to
     avoid overhead during normal navigation.
     """
     # Guard clauses: skip non-HTML responses and un-opted-in requests.
     if "text/html" not in response['Content-Type']:
         return response
     if not request.GET.get("appcache_analyze", False):
         return response
     lxdoc = document_fromstring(response.content)
     self.walk_tree(lxdoc)
     response.appcache = {
         'cached': self._cached,
         'fallback': self._fallback,
         'network': self._network,
     }
     return response
コード例 #11
0
def get_html5_parsed_text(text):
    """Return content using the html5parser, ideal for faulty html.

    Parse with the html5parser (which repairs irregular markup), then
    serialize with `tostring` and re-parse with the regular
    `fromstring` so the result exposes the usual lxml API.

    :param text: The html of the document
    :return: an lxml.HtmlElement object
    """
    repaired = tostring(
        html5parser.document_fromstring(text.encode("utf-8")),
        encoding="unicode",
    )
    return fromstring(repaired)
コード例 #12
0
    def test_index_shows_all_tutorials(self):
        '''The Tutorial index should show all tutorials'''

        # Random suffixes keep the fixture titles unique per run.
        expected = ["A-{}".format(uuid.uuid4()), "B-{}".format(uuid.uuid4())]
        for tutorial_title in expected:
            models.Tutorial.objects.create(title=tutorial_title,
                                           fpid=tutorial_title)

        resp = self.client.get(reverse('index'))
        # Round-trip through html5parser so faulty markup is repaired
        # before cssselect runs on a regular lxml tree.
        html5 = html5parser.document_fromstring(resp.content)
        nodes = document_fromstring(etree.tostring(html5))

        titles = [n.text for n in nodes.cssselect('.t-tutorial-title')]
        self.assertEqual(expected, titles)
コード例 #13
0
ファイル: ca11_u.py プロジェクト: m4h7/juriscraper
    def _make_html_tree(self, text):
        """ Grab the content using the html5parser.

        The html5parser first builds an _Element object; we serialize it
        with `tostring` and re-parse with the regular `fromstring` so
        that irregularities in the html are repaired while the familiar
        lxml API is preserved.

        :param text: The html of the document
        :return: an lxml.HtmlElement object
        """
        html5_root = html5parser.document_fromstring(text)
        return fromstring(tostring(html5_root))
コード例 #14
0
ファイル: ca11_u.py プロジェクト: uglyboxer/juriscraper
    def _make_html_tree(self, text):
        """ Grab the content using the html5parser.

        Parsing happens twice on purpose: the html5parser repairs
        faulty markup, then `tostring` + `fromstring` round-trips the
        result into an ordinary lxml tree with the usual API.

        :param text: The html of the document
        :return: an lxml.HtmlElement object
        """
        serialized = tostring(html5parser.document_fromstring(text))
        html_tree = fromstring(serialized)
        return html_tree
コード例 #15
0
ファイル: crawl.py プロジェクト: dpk/untangle
 def crawl(self):
   """Crawl every URL in self.urls, following in-domain HTML links.

   Repeatedly fetches each not-yet-visited URL, hands the response to
   page_handler, extracts its links, and queues valid new targets
   until no unvisited URLs remain.
   """
   tocrawl = self.urls
   self.visited = set()
   while not len(tocrawl) == 0:
     toadd = set()
     for url in tocrawl:
       self.visited.add(url)
       resp, content = http.request(url, 'GET')
       # Skip missing pages and anything that is not HTML.
       if resp['status'] == '404' or 'text/html' not in resp['content-type']:
         continue

       self.page_handler(resp, content)
       h = html5.document_fromstring(content, guess_charset=False)
       for link in links.links(h, url=url):
         if self.isurlvalid(link.dest):
           # Queue the target with any #fragment stripped.
           toadd.add(link.dest[:link.dest.rindex('#')] if '#' in link.dest else link.dest)
         self.link_handler(link)

     for url in toadd: self.urls.add(url)
     # NOTE(review): symmetric difference — this equals urls - visited
     # only while visited is a subset of urls; confirm that invariant.
     tocrawl = (self.urls ^ self.visited)
コード例 #16
0
ファイル: htmls.py プロジェクト: ZoeyYoung/python-readability
def build_doc(page):
    """Parse fetched HTML into an lxml element.

    :param page: the fetched page content; must not be None
    :return: an lxml element on success, or '' when *page* is None
    """
    # Requires that the `page` not be None
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''
    # html5parser repairs faulty markup; round-trip through tostring /
    # fragment_fromstring to get a plain lxml element with the usual API.
    html5doc = html5parser.document_fromstring(decode_html(page))
    doc = fragment_fromstring(tostring(html5doc))
    # Debug dump of the parsed tree; `with` guarantees the handle is
    # closed (the original leaked the open file object).
    with open('parse.txt', 'w') as output:
        print(tostring(doc), file=output)
    # NOTE: soupparser.fromstring(decode_html(page)) is a possible
    # fallback if tostring ever raises UnicodeDecodeError.
    return doc
コード例 #17
0
ファイル: __init__.py プロジェクト: zanachka/webstruct-demo
def extract_ner(response_content, response_url, base_url):
    """Run the NER model over a fetched page and annotate its HTML.

    :param response_content: raw HTML of the fetched page
    :param response_url: URL the content was fetched from
    :param base_url: base URL used when rewriting parent links
    :return: tuple (annotated html string, page title, entities, groups)
    """
    url = response_url
    # Normalize the tree: strip namespaces, absolutize/rewrite links.
    tree = html5parser.document_fromstring(response_content)
    tree = remove_namespace(tree)
    tree = absolute_links(tree, url)
    tree = parent_links(tree, base_url)

    title = tree.xpath('//title')[0].text

    model = joblib.load(webstruct_demo.config['MODEL_PATH'])
    # Tag the tokens, then rebuild an annotated tree for display.
    tree, tokens, tags = run_model(tree, model)
    tree = model.html_tokenizer.detokenize_single(tokens, tags)
    tree = webstruct.webannotator.to_webannotator(
        tree, entity_colors=model.entity_colors, url=url)
    content = lxml.html.tostring(tree, encoding='utf-8').decode('utf-8')
    # Group IOB-tagged tokens into entities, dropping 'O' (outside) tags.
    entities = webstruct.sequence_encoding.IobEncoder.group(zip(tokens, tags))
    entities = webstruct.model._drop_empty((model.build_entity(tokens), tag)
                                           for (tokens, tag) in entities
                                           if tag != 'O')
    groups = webstruct.model.extract_entitiy_groups(
        tokens, tags, dont_penalize=None, join_tokens=model.build_entity)

    return content, title, entities, groups
コード例 #18
0
ファイル: htmls.py プロジェクト: lantian316/Bookmarks_Cloud
def build_doc(page):
    """Parse fetched HTML into an lxml element.

    :param page: the fetched page content; must not be None
    :return: an lxml element on success, or '' when *page* is None
    """
    # Requires that the `page` not be None
    if page is None:
        LOG.error("Page content is None, can't build_doc")
        return ''
    # decode_html expects text or bytes; coerce anything else.
    if not isinstance(page, (str, bytes)):
        page = str(page)
    # html5parser repairs faulty markup; round-trip through tostring /
    # fragment_fromstring to get a plain lxml element with the usual API.
    html5doc = html5parser.document_fromstring(decode_html(page))
    doc = fragment_fromstring(tostring(html5doc))
    # Debug dump of the parsed tree; `with` guarantees the handle is
    # closed (the original leaked the open file object).
    with open('parse.txt', 'w') as output:
        print(tostring(doc), file=output)
    # NOTE: soupparser.fromstring(decode_html(page)) is a possible
    # fallback if tostring ever raises UnicodeDecodeError.
    return doc
コード例 #19
0
ファイル: html5lib-vs-clean.py プロジェクト: kwinkunks/Objavi
#!/usr/bin/python

import lxml.html, lxml.html.clean
from lxml.html import html5parser

# Minimal fixture for comparing the html5lib-based parser with the
# plain lxml parser when running the Cleaner over comments.
html = """<html><body>
<!-- a comment -->
</body></html>
"""

#tree = lxml.html.document_fromstring(html)
tree = html5parser.document_fromstring(html)

# Default Cleaner settings; calling it mutates the tree in place.
cleaner = lxml.html.clean.Cleaner()
cleaner(tree)
コード例 #20
0
ファイル: lxml_parse.py プロジェクト: andynicholson/Objavi

</body></html>
"""


html ="""
<html><body>
<!-- a comment -->
</body></html>
"""

import lxml, lxml.html, lxml.html.clean
from lxml.html import html5parser
tree = lxml.html.document_fromstring(html)
tree = html5parser.document_fromstring(html)

cleaner = lxml.html.clean.Cleaner(scripts=True,
                                  javascript=True,
                                  comments=False,
                                  style=True,
                                  links=True,
                                  meta=True,
                                  page_structure=False,
                                  processing_instructions=True,
                                  embedded=True,
                                  frames=True,
                                  forms=True,
                                  annoying_tags=True,
                                  #allow_tags=OK_TAGS,
                                  remove_unknown_tags=True,
コード例 #21
0
ファイル: ny.py プロジェクト: m4h7/juriscraper
 def _make_html_tree(self, text):
     """Parse *text* with html5parser (which repairs faulty markup),
     then round-trip through tostring/fromstring for the usual lxml API."""
     repaired = html5parser.document_fromstring(text)
     return fromstring(tostring(repaired))
コード例 #22
0
 def _make_html_tree(self, text):
     # html5parser fixes faulty markup; serialize and re-parse with the
     # standard lxml parser to get the familiar API back.
     html_tree = fromstring(tostring(html5parser.document_fromstring(text)))
     return html_tree
コード例 #23
0
from lxml.html import fragment_fromstring
from lxml.html import tostring
html = """
<html>
<head>
</head>
<body>
    Text
    <h1>Title</h1>
    Tail1
    <div>p1<p>inner text</p></div>
    <p>p2</p>
    Tail2
</body>
"""
# Parse with html5parser (repairs the unterminated markup), then
# round-trip through tostring/fragment_fromstring for a plain lxml tree.
# NOTE(review): html5parser output is XHTML-namespaced by default;
# confirm `.//body` still matches after the round-trip.
html5doc = html5parser.document_fromstring(html, guess_charset=False)
root = fragment_fromstring(tostring(html5doc))
for elem in root.findall(".//body"):
    # 1. Wrap the body's leading text in a <p>, i.e. Text -> <p>Text</p>
    if elem.text and elem.text.strip():
        p = fragment_fromstring('<p/>')
        p.text = elem.text
        elem.text = None
        elem.insert(0, p)
    # 2. Wrap the body's child elements in a <div>
    div = fragment_fromstring("<div/>")
    for e in elem.iterchildren():
        print(e, e.text)
        div.append(e)
        print(tostring(div))
    elem.insert(0, div)
コード例 #24
0
 def call_it(self, *args, **kwargs):
     """Forward all arguments to lxml's html5parser.document_fromstring."""
     from lxml.html.html5parser import document_fromstring
     return document_fromstring(*args, **kwargs)
コード例 #25
0
ファイル: test_html5parser.py プロジェクト: dairiki/lxml
 def call_it(self, *args, **kwargs):
     # Thin test harness hook: resolve the parser lazily, pass
     # every argument straight through.
     from lxml.html.html5parser import document_fromstring
     parse = document_fromstring
     return parse(*args, **kwargs)