def webgrep(xpath, urls, is_recursive): root_url = urls[0] is_multiple_source = is_recursive or len(urls) > 1 queue = [(url, None) for url in urls] visited = dict((url, True) for url in urls) while len(queue) > 0: (url, referrer) = queue.pop(0) req = urllib2.Request(url) if referrer: req.add_header('Referer', referrer) try: f = urllib2.urlopen(req) content_type = f.info().gettype() if not content_type in ('text/html', 'application/xhtml+xml'): f.close() continue content = f.read() f.close() except urllib2.URLError as e: print >> sys.stderr, "%s: %s" % (url, e) continue try: doc = libxml2.htmlReadMemory( content, len(content), url, None, libxml2.HTML_PARSE_RECOVER | libxml2.HTML_PARSE_NOERROR | libxml2.HTML_PARSE_NOWARNING | libxml2.HTML_PARSE_NONET) ctx = doc.xpathNewContext() for node in ctx.xpathEvalExpression(xpath): content = node.content.strip() if node.type == 'attribute' and node.name in ('href', 'src'): content = urljoin(url, content) if is_multiple_source: print "%s:%s" % (url, content) else: print content if is_recursive: for node in ctx.xpathEvalExpression('//a/@href'): (next_url, fragment) = urldefrag( urljoin(url, node.content)) if next_url.startswith( root_url) and not next_url in visited: queue.append((next_url, url)) visited[next_url] = True ctx.xpathFreeContext() doc.freeDoc() except libxml2.treeError as e: print >> sys.stderr, "%s: %s" % (url, e)
def get_html_doc(html):
    """Parse an HTML string into a libxml2 document.

    Returns None when `html` is None or empty.  Control characters that
    libxml2 rejects are blanked out first, and the document encoding is
    sniffed from the markup, falling back to UTF-8 when none is declared.
    """
    if not html:
        return None
    # Blank out XML-illegal control characters (all C0 controls except
    # tab, LF and CR) so the parser does not choke on them.
    cleaned = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]', ' ', cleaned if False else html)
    encoding = get_charset(cleaned)
    if encoding is None:
        encoding = 'utf-8'
    parse_options = (libxml2.HTML_PARSE_RECOVER
                     | libxml2.HTML_PARSE_NOERROR
                     | libxml2.HTML_PARSE_PEDANTIC
                     | libxml2.HTML_PARSE_NONET
                     | libxml2.HTML_PARSE_NOWARNING)
    return libxml2.htmlReadMemory(cleaned, len(cleaned), None, encoding,
                                  parse_options)
def webgrep(xpath, urls, is_recursive): root_url = urls[0] is_multiple_source = is_recursive or len(urls) > 1 queue = [(url, None) for url in urls] visited = dict((url, True) for url in urls) while len(queue) > 0: (url, referrer) = queue.pop(0) req = urllib2.Request(url) if referrer: req.add_header('Referer', referrer) try: f = urllib2.urlopen(req) content_type = f.info().gettype() if not content_type in ('text/html', 'application/xhtml+xml'): f.close() continue content = f.read() f.close() except urllib2.URLError as e: print >>sys.stderr, "%s: %s" % (url, e) continue try: doc = libxml2.htmlReadMemory(content, len(content), url, None, libxml2.HTML_PARSE_RECOVER | libxml2.HTML_PARSE_NOERROR | libxml2.HTML_PARSE_NOWARNING | libxml2.HTML_PARSE_NONET) ctx = doc.xpathNewContext() for node in ctx.xpathEvalExpression(xpath): content = node.content.strip() if node.type == 'attribute' and node.name in ('href', 'src'): content = urljoin(url, content) if is_multiple_source: print "%s:%s" % (url, content) else: print content if is_recursive: for node in ctx.xpathEvalExpression('//a/@href'): (next_url, fragment) = urldefrag(urljoin(url, node.content)) if next_url.startswith(root_url) and not next_url in visited: queue.append((next_url, url)) visited[next_url] = True ctx.xpathFreeContext() doc.freeDoc() except libxml2.treeError as e: print >>sys.stderr, "%s: %s" % (url, e)