def webgrep(xpath, urls, is_recursive): root_url = urls[0] is_multiple_source = is_recursive or len(urls) > 1 queue = [(url, None) for url in urls] visited = dict((url, True) for url in urls) while len(queue) > 0: (url, referrer) = queue.pop(0) req = urllib2.Request(url) if referrer: req.add_header('Referer', referrer) try: f = urllib2.urlopen(req) content_type = f.info().gettype() if not content_type in ('text/html', 'application/xhtml+xml'): f.close() continue content = f.read() f.close() except urllib2.URLError as e: print >> sys.stderr, "%s: %s" % (url, e) continue try: doc = libxml2.htmlReadMemory( content, len(content), url, None, libxml2.HTML_PARSE_RECOVER | libxml2.HTML_PARSE_NOERROR | libxml2.HTML_PARSE_NOWARNING | libxml2.HTML_PARSE_NONET) ctx = doc.xpathNewContext() for node in ctx.xpathEvalExpression(xpath): content = node.content.strip() if node.type == 'attribute' and node.name in ('href', 'src'): content = urljoin(url, content) if is_multiple_source: print "%s:%s" % (url, content) else: print content if is_recursive: for node in ctx.xpathEvalExpression('//a/@href'): (next_url, fragment) = urldefrag( urljoin(url, node.content)) if next_url.startswith( root_url) and not next_url in visited: queue.append((next_url, url)) visited[next_url] = True ctx.xpathFreeContext() doc.freeDoc() except libxml2.treeError as e: print >> sys.stderr, "%s: %s" % (url, e)
def get_html_doc(html):
    """Parse an HTML string into a libxml2 document.

    Returns None when `html` is None or empty.  Control characters that
    libxml2 rejects are blanked out first, and the document encoding is
    sniffed from the markup, falling back to UTF-8 when none is declared.
    """
    if not html:
        return None
    # Blank out XML-illegal control characters (all C0 controls except
    # tab, LF and CR) so the parser does not choke on them.
    cleaned = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]', ' ', cleaned if False else html)
    encoding = get_charset(cleaned)
    if encoding is None:
        encoding = 'utf-8'
    parse_options = (libxml2.HTML_PARSE_RECOVER
                     | libxml2.HTML_PARSE_NOERROR
                     | libxml2.HTML_PARSE_PEDANTIC
                     | libxml2.HTML_PARSE_NONET
                     | libxml2.HTML_PARSE_NOWARNING)
    return libxml2.htmlReadMemory(cleaned, len(cleaned), None, encoding,
                                  parse_options)
def webgrep(xpath, urls, is_recursive): root_url = urls[0] is_multiple_source = is_recursive or len(urls) > 1 queue = [(url, None) for url in urls] visited = dict((url, True) for url in urls) while len(queue) > 0: (url, referrer) = queue.pop(0) req = urllib2.Request(url) if referrer: req.add_header('Referer', referrer) try: f = urllib2.urlopen(req) content_type = f.info().gettype() if not content_type in ('text/html', 'application/xhtml+xml'): f.close() continue content = f.read() f.close() except urllib2.URLError as e: print >>sys.stderr, "%s: %s" % (url, e) continue try: doc = libxml2.htmlReadMemory(content, len(content), url, None, libxml2.HTML_PARSE_RECOVER | libxml2.HTML_PARSE_NOERROR | libxml2.HTML_PARSE_NOWARNING | libxml2.HTML_PARSE_NONET) ctx = doc.xpathNewContext() for node in ctx.xpathEvalExpression(xpath): content = node.content.strip() if node.type == 'attribute' and node.name in ('href', 'src'): content = urljoin(url, content) if is_multiple_source: print "%s:%s" % (url, content) else: print content if is_recursive: for node in ctx.xpathEvalExpression('//a/@href'): (next_url, fragment) = urldefrag(urljoin(url, node.content)) if next_url.startswith(root_url) and not next_url in visited: queue.append((next_url, url)) visited[next_url] = True ctx.xpathFreeContext() doc.freeDoc() except libxml2.treeError as e: print >>sys.stderr, "%s: %s" % (url, e)