def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse *raw* (bytes or text) as HTML5 and return the root element.

    :param raw: the markup; bytes are decoded via *decoder* or, when that is
        None, via ``xml_to_unicode``.
    :param decoder: optional callable used to decode byte input.
    :param log: unused here; accepted for caller compatibility.
    :param discard_namespaces: when True, parse with the no-namespace tree
        builder and expect a bare ``html`` root tag.
    :param line_numbers: track source positions while parsing.
    :param linenumber_attribute: attribute name the tree builder uses to
        record line numbers on elements, if any.
    :param replace_entities: replace XML entities before parsing.
    :param fix_newlines: normalize ``\\r\\n``/``\\r`` to ``\\n`` first.
    :raises ValueError: when the parse does not yield the expected root.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Strip characters the parser cannot accept (see replace_chars pattern).
    raw = replace_chars.sub('', raw)
    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
    # Retry loop: when the no-namespace builder hits a namespaced tag it
    # raises NamespacedHTMLPresent; we strip that prefix from the markup
    # and reparse from a fresh stream.
    while True:
        try:
            parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                # html5lib warns when the tree builder drops information.
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    # Break the reference cycle so the tree can be collected.
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            # Remove the offending "prefix:" from opening and closing tags.
            raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = stream_class(raw)
            continue
        break
    root = parser.tree.getDocument()
    # Sanity-check the root: bare 'html' without namespaces, or the
    # namespaced '{...}html' with no prefix otherwise.
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
def validate_url(self, url, use_w3c=True, quite=True):
    """Validate *url* with the local html5lib parser and, optionally, the
    W3C validator. Needs an Internet connection for the W3C upload.

    :param url: URL fetched through the test ``Client`` (redirects followed).
    :param use_w3c: also upload the page to the W3C validator when the
        local strict parse produced no errors.
    :param quite: kept for backward compatibility (sic, "quiet"); when
        False, print each individual W3C validator message.
    """
    client = Client()
    response = client.get(url, follow=True)
    if response.status_code == 200:
        src = response.content
        treebuilder = treebuilders.getTreeBuilder("etree")
        parser = HTMLParser(tree=treebuilder, strict=True)
        try:
            parser.parse(src)
        except Exception:
            # strict mode raises on the first error; the collected errors
            # remain available on parser.errors, so just fall through.
            pass
        if not parser.errors and use_w3c:
            # uploading to w3c
            w3c = w3c_client(src)
            if w3c and not w3c[0]:
                print('%s: %s' % (url, w3c[1]))
                if not quite:
                    for i in w3c[2]['messages']:
                        print(i['messageid'])
                        print('\t%s' % (i['message'],))
                #self.assertTrue(w3c[0])
    else:
        # BUG FIX: the original passed the status tuple as a second
        # positional argument to print(), emitting the literal '%s' and
        # the tuple; interpolate it instead.
        print('skipping html check %s' % (response.status_code,))
def runValidatorTest(self, test):
    """Run one HTML conformance-checker test case.

    :param test: mapping with 'input' (the markup) and optionally
        'fail-if' / 'fail-unless' naming an error code that must be
        absent / present in the parser's collected errors.
    """
    p = HTMLParser(tokenizer=HTMLConformanceChecker)
    p.parse(test['input'])
    errorCodes = [errorcode for position, errorcode, datavars in p.errors]
    # dict.has_key() is deprecated (and removed in Python 3); the `in`
    # operator is the equivalent, portable spelling.
    if 'fail-if' in test:
        self.failIf(test['fail-if'] in errorCodes)
    if 'fail-unless' in test:
        self.failUnless(test['fail-unless'] in errorCodes)
def cutHtml(text, max_len):
    """Truncate the HTML in *text* so its visible content fits *max_len*.

    Returns the re-serialized (truncated) markup, or ``None`` when the
    content already fits and no truncation was performed.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    document = parser.parse(text)
    limit = Sentinel(max_len)
    processItem(document.getroot(), limit)
    if not limit.stop:
        # Nothing was cut; signal "unchanged" to the caller.
        return None
    # Serialize the <body> subtree (second child of the root element).
    body = document.getroot().getchildren()[1]
    walker = treewalkers.getTreeWalker("lxml")
    html_serializer = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    pieces = list(html_serializer.serialize(walker(body)))
    # Drop the leading <body> and trailing </body> tokens.
    return ''.join(pieces[1:-1])
def validate_html(self, response):
    """Parse ``response.body`` with html5lib and fail the test when it
    contains any markup error other than a single unknown doctype.

    Prints each error with the offending source line and a caret marking
    the column before failing. (Python 2 code: uses print statements.)
    """
    # only import this stuff if we need it!
    from html5lib.html5parser import HTMLParser
    from html5lib.filters.validator import HTMLConformanceChecker
    import pprint
    p = HTMLParser()
    p.parse(response.body)
    # Tolerate exactly one 'unknown-doctype' error; anything else fails.
    if (p.errors and
            not (len(p.errors) == 1 and p.errors[0][1] == 'unknown-doctype')):
        lines = response.body.splitlines()
        for (line, col), error, vars in p.errors:
            # NOTE(review): line appears 1-based (lines[line - 1]) and
            # col 0-based (caret padding) — confirm against html5lib.
            print "----------------------------------"
            print "Error: %s on line %s, column %s" % (error, line, col)
            print "%5d: %s" % (line, lines[line - 1])
            print "       %s^" % (" " * col, )
        self.assert_(False)
def __init__(self, value, element):
    """Validate *value* as an HTML fragment belonging to *element*,
    logging a ValidHtml, SecurityRisk or NotHtml event as appropriate.

    :param value: the raw (X)HTML text to validate.
    :param element: the feed element the text came from; used for log
        context and for locating parse errors in the source document.
    """
    self.element = element
    self.valid = True
    self.parser = HTMLParser(strict=True)
    # <?import ...> processing instructions are a known IE attack vector,
    # so flag them regardless of whether the fragment parses.
    if value.lower().find('<?import ') >= 0:
        self.log(SecurityRisk({"parent": self.element.parent.name,
                               "element": self.element.name,
                               "tag": "?import"}))
    try:
        etree = self.parser.parseFragment(value)
        if self.valid:
            self.log(ValidHtml({"parent": self.element.parent.name,
                                "element": self.element.name}))
        # (removed an unused `from pprint import pprint` left over from
        # debugging)
        for tag in etree.iter():
            if tag.tag != "DOCUMENT_FRAGMENT":
                # Strip any '{namespace}' prefix before dispatching.
                self.handle_tag(tag.tag.split('}')[-1], tag.attrib, tag.text)
    except ParseError as msg:
        element = self.element
        # Translate the parser's fragment-relative position into an offset
        # within the source document for the log entry.
        offset = [element.line - element.dispatcher.locator.getLineNumber(),
                  -element.dispatcher.locator.getColumnNumber()]
        # raw string: '\d' is an invalid escape sequence in a plain string
        # literal on modern Python.
        match = re.search(r', at line (\d+), column (\d+)', str(msg))
        if match:
            offset[0] += int(match.group(1)) - 1
        element.log(NotHtml({"parent": element.parent.name,
                             "element": element.name,
                             "message": "Invalid HTML",
                             "value": str(msg)}), offset)
def thumbnails(html):
    """
    Given a HTML string, converts paths in img tags to thumbnail
    paths, using Mezzanine's ``thumbnail`` template tag. Used as
    one of the default values in the ``RICHTEXT_FILTERS`` setting.
    """
    from django.conf import settings
    from html5lib.treebuilders import getTreeBuilder
    from html5lib.html5parser import HTMLParser
    #from mezzanine.core.templatetags.mezzanine_tags import thumbnail
    from asylum_custom.templatetags.asylum_tags import thumbnail

    document = HTMLParser(tree=getTreeBuilder("dom")).parse(html)
    for image in document.getElementsByTagName("img"):
        source = image.getAttribute("src")
        width = image.getAttribute("width")
        height = image.getAttribute("height")
        # Only rewrite images that carry all three attributes.
        if not (source and width and height):
            continue
        thumb = settings.MEDIA_URL + thumbnail(source, width, height)
        image.setAttribute("src", thumb)
    # Re-serialize just the children of <body>, i.e. the original fragment.
    body_children = document.getElementsByTagName("body")[0].childNodes
    return "".join(child.toxml() for child in body_children)
def openLogin():
    """Log in to the Apple developer portal and start the download flow.

    Raises on any HTTP failure, or Exception('Login failed') when the
    authentication endpoint bounces us back to the login form.
    """
    logging.info('Loading login page...')
    session = requests.Session()
    # trigger login page
    resp = session.get('https://developer.apple.com/download/',
                       headers={'User-Agent': UA})
    resp.raise_for_status()
    dom = HTMLParser(getTreeBuilder('dom')).parse(resp.text)
    form_input = bakeRequest(dom)
    logging.info('Logging in ...')
    resp = session.post('https://idmsa.apple.com/IDMSWebAuth/authenticate',
                        headers={'User-Agent': UA}, data=form_input)
    resp.raise_for_status()
    # Landing back on an .../authenticate URL means credentials were rejected.
    if resp.url.find('authenticate') > 0:
        raise Exception('Login failed')
    logging.info('Fetching download token...')
    resp = session.post(
        'https://developer.apple.com/services-account/QH65B2/downloadws/listDownloads.action',
        headers={'User-Agent': UA}, data='')
    resp.raise_for_status()
    generateDownload(session.cookies.items())
def fixHtml(html):
    """Round-trip *html* through the parser, normalizing malformed markup."""
    fragment = HTMLParser().parseFragment(html)
    parts = [node.toxml() for node in fragment.childNodes]
    return ''.join(parts)