def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse HTML5 markup and return the root element of the built tree.

    ``raw`` may be bytes (decoded via ``decoder`` when given, otherwise via
    ``xml_to_unicode``) or text.  Self-closing CDATA tags, entities and
    newlines are normalised before parsing.  When the parser reports a
    namespaced HTML prefix, that prefix is stripped from the markup and the
    parse is retried from scratch.

    Raises ValueError if the document does not produce a proper ``<html>``
    root element.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)
    make_stream = partial(FastStream, track_position=line_numbers)
    stream = make_stream(raw)
    tree_factory = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
    while True:
        try:
            parser = HTMLParser(tree=tree_factory, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            # Remove the offending "prefix:" from every opening/closing tag
            # and retry the whole parse on the rewritten markup.
            raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = make_stream(raw)
            continue
        break
    root = parser.tree.getDocument()
    if (discard_namespaces and root.tag != 'html') or (not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
def validate_url(self, url, use_w3c=True, quite=True):
    """Validate ``url`` with the local html5lib parser and, optionally, the
    W3C validator.  Needs an Internet connection for the W3C check.

    ``quite`` (sic — kept for backward compatibility) suppresses the
    per-message W3C output when True.
    """
    client = Client()
    response = client.get(url, follow=True)
    if response.status_code == 200:
        src = response.content
        treebuilder = treebuilders.getTreeBuilder("etree")
        parser = HTMLParser(tree=treebuilder, strict=True)
        try:
            parser.parse(src)
        except Exception:
            # strict mode raises on the first error, but the errors are still
            # collected on parser.errors and inspected below — best effort
            pass
        if not parser.errors and use_w3c:
            # uploading to w3c
            w3c = w3c_client(src)
            if w3c and not w3c[0]:
                print('%s: %s' % (url, w3c[1]))
                if not quite:
                    for i in w3c[2]['messages']:
                        print(i['messageid'])
                        print('\t%s' % (i['message'],))
                #self.assertTrue(w3c[0])
    else:
        # BUG FIX: original passed the tuple as a second argument to print()
        # instead of %-interpolating it into the message.
        print('skipping html check %s' % (response.status_code,))
def runValidatorTest(self, test):
    """Run one conformance-checker test case.

    ``test`` is a dict with an ``'input'`` document and optionally a
    ``'fail-if'`` error code that must NOT be reported and/or a
    ``'fail-unless'`` error code that MUST be reported.
    """
    p = HTMLParser(tokenizer=HTMLConformanceChecker)
    p.parse(test['input'])
    errorCodes = [errorcode for position, errorcode, datavars in p.errors]
    # dict.has_key() was removed in Python 3; the `in` operator is
    # equivalent and works on Python 2 as well.  failIf/failUnless are
    # deprecated aliases of assertFalse/assertTrue.
    if 'fail-if' in test:
        self.assertFalse(test['fail-if'] in errorCodes)
    if 'fail-unless' in test:
        self.assertTrue(test['fail-unless'] in errorCodes)
def validate_html(self, response):
    """Parse ``response.body`` with html5lib and fail the test with an
    annotated error listing if it contains any markup errors other than a
    single 'unknown-doctype' complaint.
    """
    # only import this stuff if we need it!
    from html5lib.html5parser import HTMLParser

    p = HTMLParser()
    p.parse(response.body)
    if (p.errors and
        not (len(p.errors) == 1 and p.errors[0][1] == 'unknown-doctype')):
        lines = response.body.splitlines()
        for (line, col), error, datavars in p.errors:
            # Single-argument print() calls work identically on Python 2 and 3.
            print("----------------------------------")
            print("Error: %s on line %s, column %s" % (error, line, col))
            print("%5d: %s" % (line, lines[line - 1]))
            print("       %s^" % (" " * col,))
        self.fail('HTML validation failed')
def validate_html(self, response):
    """Parse ``response.body`` with html5lib and fail the test with an
    annotated error listing if it contains any markup errors other than a
    single 'unknown-doctype' complaint.
    """
    # only import this stuff if we need it!
    from html5lib.html5parser import HTMLParser

    p = HTMLParser()
    p.parse(response.body)
    if (p.errors and
        not (len(p.errors) == 1 and p.errors[0][1] == 'unknown-doctype')):
        lines = response.body.splitlines()
        for (line, col), error, datavars in p.errors:
            # Single-argument print() calls work identically on Python 2 and 3.
            print("----------------------------------")
            print("Error: %s on line %s, column %s" % (error, line, col))
            print("%5d: %s" % (line, lines[line - 1]))
            print("       %s^" % (" " * col,))
        self.fail('HTML validation failed')
def cutHtml(text, max_len):
    """Truncate the HTML document ``text`` at roughly ``max_len`` characters.

    Returns the serialized, truncated body content (without the enclosing
    <body></body> tags), or None when the document fits within the limit
    and no truncation was necessary.
    """
    html_parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    document = html_parser.parse(text)
    limiter = Sentinel(max_len)
    processItem(document.getroot(), limiter)
    if not limiter.stop:
        return None
    tree_walker = treewalkers.getTreeWalker("lxml")
    token_stream = tree_walker(document.getroot().getchildren()[1])
    html_serializer = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    pieces = list(html_serializer.serialize(token_stream))
    pieces = pieces[1:-1]  # remove <body></body>
    return ''.join(pieces)
def openLogin():
    """Log in to the Apple developer portal and hand the session cookies to
    generateDownload().  Raises on any HTTP failure or a failed login.
    """
    logging.info('Loading login page...')
    session = requests.Session()
    # trigger login page
    login_page = session.get('https://developer.apple.com/download/', headers={'User-Agent': UA})
    login_page.raise_for_status()
    dom_parser = HTMLParser(getTreeBuilder('dom'))
    dom = dom_parser.parse(login_page.text)
    form_input = bakeRequest(dom)
    logging.info('Logging in ...')
    auth = session.post('https://idmsa.apple.com/IDMSWebAuth/authenticate', headers={'User-Agent': UA}, data=form_input)
    auth.raise_for_status()
    # a successful login redirects away from the authenticate endpoint
    if auth.url.find('authenticate') > 0:
        raise Exception('Login failed')
    logging.info('Fetching download token...')
    token = session.post('https://developer.apple.com/services-account/QH65B2/downloadws/listDownloads.action', headers={'User-Agent': UA}, data='')
    token.raise_for_status()
    generateDownload(session.cookies.items())
def openLogin():
    """Authenticate against the Apple developer portal, then pass the
    resulting session cookies to generateDownload().
    """
    logging.info('Loading login page...')
    http = requests.Session()
    # trigger login page
    page = http.get('https://developer.apple.com/download/', headers={'User-Agent': UA})
    page.raise_for_status()
    parsed = HTMLParser(getTreeBuilder('dom')).parse(page.text)
    payload = bakeRequest(parsed)
    logging.info('Logging in ...')
    result = http.post('https://idmsa.apple.com/IDMSWebAuth/authenticate', headers={'User-Agent': UA}, data=payload)
    result.raise_for_status()
    # still sitting on the authenticate endpoint means the credentials failed
    if result.url.find('authenticate') > 0:
        raise Exception('Login failed')
    logging.info('Fetching download token...')
    token_resp = http.post(
        'https://developer.apple.com/services-account/QH65B2/downloadws/listDownloads.action',
        headers={'User-Agent': UA},
        data='')
    token_resp.raise_for_status()
    generateDownload(http.cookies.items())