def runSanitizerTest(name, expected, input):
    expected = ''.join([
        token.toxml() for token in html5parser.HTMLParser().parseFragment(
            expected).childNodes
    ])
    expected = json.loads(json.dumps(expected))
    assert expected == sanitize_html(input)
def test_all_tokens(self):
    expected = [
        {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
        {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
        {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
        {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
        {'data': 'a', 'type': 'Characters'},
        {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
        {'data': 'b', 'type': 'Characters'},
        {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
        {'data': 'c', 'type': 'Characters'},
        {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
        {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
    ]
    for treeName, treeCls in treeTypes.items():
        p = html5parser.HTMLParser(tree=treeCls["builder"])
        document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
        document = treeCls.get("adapter", lambda x: x)(document)
        output = treeCls["walker"](document)
        for expectedToken, outputToken in zip(expected, output):
            self.assertEqual(expectedToken, outputToken)
def content(xentry, name, detail, bozo):
    """ insert a content-like element into the entry """
    if not detail or not detail.value:
        return
    data = None
    xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    xdoc = xentry.ownerDocument
    xcontent = xdoc.createElement(name)

    if isinstance(detail.value, unicode):
        detail.value = detail.value.encode('utf-8')

    if not detail.has_key('type') or detail.type.lower().find('html') < 0:
        detail['value'] = escape(detail.value)
        detail['type'] = 'text/html'

    if detail.type.find('xhtml') >= 0 and not bozo:
        try:
            data = minidom.parseString(xdiv % detail.value).documentElement
            xcontent.setAttribute('type', 'xhtml')
        except:
            bozo = 1

    if detail.type.find('xhtml') < 0 or bozo:
        parser = html5parser.HTMLParser(
            tree=treebuilders.getTreeBuilder('dom'))
        html = parser.parse(xdiv % detail.value, encoding="utf-8")
        for body in html.documentElement.childNodes:
            if body.nodeType != Node.ELEMENT_NODE:
                continue
            if body.nodeName != 'body':
                continue
            for div in body.childNodes:
                if div.nodeType != Node.ELEMENT_NODE:
                    continue
                if div.nodeName != 'div':
                    continue
                try:
                    div.normalize()
                    if len(div.childNodes) == 1 and \
                       div.firstChild.nodeType == Node.TEXT_NODE:
                        data = div.firstChild
                        if illegal_xml_chars.search(data.data):
                            data = xdoc.createTextNode(
                                illegal_xml_chars.sub(invalidate, data.data))
                    else:
                        data = div
                        xcontent.setAttribute('type', 'xhtml')
                    break
                except:
                    # in extremely nested cases, the Python runtime decides
                    # that normalize() must be in an infinite loop; mark
                    # the content as escaped html and proceed on...
                    xcontent.setAttribute('type', 'html')
                    data = xdoc.createTextNode(detail.value.decode('utf-8'))

    if data:
        xcontent.appendChild(data)

    if detail.get("language"):
        xcontent.setAttribute('xml:lang', detail.language)

    xentry.appendChild(xcontent)
def runtest(self):
    if self.treeAPIs is None:
        pytest.skip("Treebuilder not loaded")

    p = html5parser.HTMLParser(tree=self.treeAPIs["builder"])

    input = self.test['data']
    fragmentContainer = self.test['document-fragment']
    expected = convertExpected(self.test['document'])

    scripting = False
    if 'script-on' in self.test:
        scripting = True

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        try:
            if fragmentContainer:
                document = p.parseFragment(input, fragmentContainer, scripting=scripting)
            else:
                document = p.parse(input, scripting=scripting)
        except constants.DataLossWarning:
            pytest.skip("data loss warning")

    poutput = convertTreeDump(p.tree.testSerializer(document))
    namespace_expected = namespaceExpected(r"\1<html \2>", expected)
    if poutput != namespace_expected:
        pytest.skip("parser output incorrect")

    document = self.treeAPIs.get("adapter", lambda x: x)(document)

    try:
        output = treewalkers.pprint(Lint(self.treeAPIs["walker"](document)))
        output = sortattrs(output)
        expected = sortattrs(expected)
        diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
                                    [line + "\n" for line in output.splitlines()],
                                    "Expected", "Received"))
        assert expected == output, "\n".join([
            "", "Input:", input,
            "", "Expected:", expected,
            "", "Received:", output,
            "", "Diff:", diff,
        ])
    except NotImplementedError:
        pytest.skip("tree walker NotImplementedError")
def runSanitizerTest(name, expected, input, toxml=None):
    if toxml is None:
        toxml = toxmlFactory()
    expected = ''.join([
        toxml(token)
        for token in html5parser.HTMLParser().parseFragment(expected)
    ])
    expected = json.loads(json.dumps(expected))
    assert expected == sanitize_html(input)
def runtest(self):
    if self.treeClass is None:
        pytest.skip("Treebuilder not loaded")

    p = html5parser.HTMLParser(tree=self.treeClass,
                               namespaceHTMLElements=self.namespaceHTMLElements)

    input = self.test['data']
    fragmentContainer = self.test['document-fragment']
    expected = convertExpected(self.test['document'])
    expectedErrors = self.test['errors'].split("\n") if self.test['errors'] else []

    scripting = False
    if 'script-on' in self.test:
        scripting = True

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        try:
            if fragmentContainer:
                document = p.parseFragment(input, fragmentContainer, scripting=scripting)
            else:
                document = p.parse(input, scripting=scripting)
        except constants.DataLossWarning:
            pytest.skip("data loss warning")

    output = convertTreeDump(p.tree.testSerializer(document))

    if self.namespaceHTMLElements:
        expected = namespaceExpected(r"\1<html \2>", expected)

    errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
                          "\nReceived:", output])
    assert expected == output, errorMsg

    errStr = []
    for (line, col), errorcode, datavars in p.errors:
        assert isinstance(datavars, dict), "%s, %s" % (errorcode, repr(datavars))
        errStr.append("Line: %i Col: %i %s" % (line, col,
                                               constants.E[errorcode] % datavars))

    errorMsg2 = "\n".join(["\n\nInput:", input,
                           "\nExpected errors (" + str(len(expectedErrors)) + "):\n" + "\n".join(expectedErrors),
                           "\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
    if False:  # we're currently not testing parse errors
        assert len(p.errors) == len(expectedErrors), errorMsg2
def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    try:
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)
    except constants.DataLossWarning:
        return

    try:
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            try:
                document = p.parse(input)
            except constants.DataLossWarning:
                return
    except:
        errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                               u"\nTraceback:", traceback.format_exc()])
        assert False, errorMsg.encode("utf8")

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(r"\1<html \2>", expected)

    errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                           u"\nReceived:", output])
    assert expected == output, errorMsg.encode("utf8")

    # errStr = [u"Line: %i Col: %i %s" % (line, col,
    #     constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,))
    #     for ((line, col), errorcode, datavars) in p.errors]

    def datavars_sub(datavars, errorcode):
        if isinstance(datavars, dict):
            return datavars
        else:
            errstr = constants.E[errorcode]
            tgt = re.compile("(\%\(\w*\)s)")
            r = tgt.search(errstr)
            d = {}
            for i, g in enumerate(r.groups()):
                d[g[2:-2]] = datavars[i]
            return d

    errStr = [u"Line: %i Col: %i %s" % (line, col,
              constants.E[errorcode] % datavars_sub(datavars, errorcode))
              for ((line, col), errorcode, datavars) in p.errors]

    errorMsg2 = u"\n".join([u"\n\nInput:", input,
                            u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
                            u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    # default to /favicon.ico unless a <link rel="icon"> points elsewhere
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            if 'icon' in link.attributes['rel'].value.lower().split(' '):
                favicon = urljoin(page, link.attributes['href'].value)
    if urlopen(favicon).info()['content-length'] != '0':
        return favicon
def parse(text):
    # First run through the Markdown parser
    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)

    # Sanitize using html5lib
    bits = []
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    for token in parser.parseFragment(text).childNodes:
        bits.append(token.toxml())
    return "".join(bits)
def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    with warnings.catch_warnings(record=True) as caughtWarnings:
        warnings.simplefilter('always')
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)
        try:
            if innerHTML:
                document = p.parseFragment(input, innerHTML)
            else:
                document = p.parse(input)
        except:
            errorMsg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
                                  '\nTraceback:', traceback.format_exc()])
            assert False, errorMsg

    otherWarnings = [x for x in caughtWarnings
                     if not issubclass(x.category, constants.DataLossWarning)]
    assert len(otherWarnings) == 0, [(x.category, x.message) for x in otherWarnings]
    if len(caughtWarnings):
        return

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(r'\1<html \2>', expected)

    errorMsg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
                          '\nReceived:', output])
    assert expected == output, errorMsg

    errStr = []
    for (line, col), errorcode, datavars in p.errors:
        assert isinstance(datavars, dict), '%s, %s' % (errorcode, repr(datavars))
        errStr.append('Line: %i Col: %i %s' % (line, col,
                                               constants.E[errorcode] % datavars))

    errorMsg2 = '\n'.join(['\n\nInput:', input,
                           '\nExpected errors (' + str(len(errors)) + '):\n' + '\n'.join(errors),
                           '\nActual errors (' + str(len(p.errors)) + '):\n' + '\n'.join(errStr)])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2
def test_fragment_single_char(tree, char):
    expected = [{'data': char, 'type': 'Characters'}]

    treeName, treeClass = tree
    if treeClass is None:
        pytest.skip("Treebuilder not loaded")

    parser = html5parser.HTMLParser(tree=treeClass["builder"])
    document = parser.parseFragment(char)
    document = treeClass.get("adapter", lambda x: x)(document)
    output = Lint(treeClass["walker"](document))

    assert list(output) == expected
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
    """tests what happens when we add attributes to the intext"""
    treeName, treeClass = tree
    parser = html5parser.HTMLParser(tree=treeClass["builder"])
    document = parser.parseFragment(intext)
    for nom, val in attrs_to_add:
        set_attribute_on_first_child(document, nom, val, treeName)

    document = treeClass.get("adapter", lambda x: x)(document)
    output = treewalkers.pprint(treeClass["walker"](document))
    output = attrlist.sub(sortattrs, output)
    if not output in expected:
        raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s"
                             % (treeName, expected, output))
def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    warnings.resetwarnings()
    warnings.simplefilter(u"error")
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    try:
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)
    except constants.DataLossWarning:
        return

    try:
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            try:
                document = p.parse(input)
            except constants.DataLossWarning:
                return
    except:
        errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                               u"\nTraceback:", traceback.format_exc()])
        assert False, errorMsg

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(ur"\1<html \2>", expected)

    errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                           u"\nReceived:", output])
    assert expected == output, errorMsg

    errStr = [u"Line: %i Col: %i %s" % (line, col,
              constants.E[errorcode] % datavars if isinstance(datavars, dict)
              else (datavars,))
              for ((line, col), errorcode, datavars) in p.errors]

    errorMsg2 = u"\n".join([u"\n\nInput:", input,
                            u"\nExpected errors (" + unicode(len(errors)) + u"):\n" + u"\n".join(errors),
                            u"\nActual errors (" + unicode(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2
def parse(text):
    sanitizer.HTMLSanitizer.allowed_elements.extend(['iframe'])
    sanitizer.HTMLSanitizer.allowed_attributes.extend(
        ['scrolling', 'allowfullscreen', 'frameborder'])

    # First run through the Markdown parser
    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)

    # Sanitize using html5lib
    bits = []
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                                    tree=getTreeBuilder("dom"))
    for token in parser.parseFragment(text).childNodes:
        bits.append(token.toxml())
    return "".join(bits)
def runParserTest(self, innerHTML, input, expected, errors, treeClass):
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    p = html5parser.HTMLParser(tree=treeClass)

    if innerHTML:
        innerHTML = str(innerHTML, "utf8")
    if errors:
        errors = str(errors, "utf8")
        errors = errors.split("\n")
    expected = str(expected, "utf8")

    try:
        if innerHTML:
            document = p.parseFragment(io.BytesIO(input), innerHTML)
        else:
            try:
                document = p.parse(io.BytesIO(input))
            except constants.DataLossWarning:
                sys.stderr.write("Test input causes known dataloss, skipping")
                return
    except:
        errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
                              "\nExpected:", expected,
                              "\nTraceback:", traceback.format_exc()])
        self.assertTrue(False, errorMsg)

    output = convertTreeDump(p.tree.testSerializer(document))
    output = attrlist.sub(sortattrs, output)

    expected = convertExpected(expected)
    expected = attrlist.sub(sortattrs, expected)
    errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
                          "\nExpected:", expected, "\nReceived:", output])
    self.assertEquals(expected, output, errorMsg)

    errStr = ["Line: %i Col: %i %s %s" % (line, col, constants.E[errorcode], datavars)
              for ((line, col), errorcode, datavars) in p.errors]
    errorMsg2 = "\n".join(["\n\nInput:", str(input, "utf8"),
                           "\nExpected errors (" + str(len(errors)) + "):\n" + "\n".join(errors),
                           "\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
    if checkParseErrors:
        self.assertEquals(len(p.errors), len(errors), errorMsg2)
def parse():
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    encoding = None

    try:
        f = args[-1]
        # Try opening from the internet
        if f.startswith('http://'):
            try:
                import urllib, cgi
                f = urllib.urlopen(f)
                contentType = f.headers.get('content-type')
                if contentType:
                    (mediaType, params) = cgi.parse_header(contentType)
                    encoding = params.get('charset')
            except:
                pass
        elif f == '-':
            f = sys.stdin
        else:
            try:
                # Try opening from file system
                f = open(f)
            except IOError:
                pass
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder("simpleTree")

    # if opts.xml:
    #     p = liberalxmlparser.XHTMLParser(tree=treebuilder)
    # else:
    if 1:
        p = html5parser.HTMLParser(tree=treebuilder,
                                   tokenizer=validator.HTMLConformanceChecker)

    document = p.parse(f, encoding=encoding)

    printOutput(p, document, opts)
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    try:
        p = html5parser.HTMLParser(tree=treeClass["builder"])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        #Ignore testcases we know we don't pass
        return

    document = treeClass.get("adapter", lambda x: x)(document)
    try:
        output = convertTokens(treeClass["walker"](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        assert expected == output, "\n".join([
            "", "Input:", input,
            "", "Expected:", expected,
            "", "Received:", output
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    warnings.resetwarnings()
    warnings.simplefilter('error')
    try:
        p = html5parser.HTMLParser(tree=treeClass['builder'])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        # Ignore testcases we know we don't pass
        return

    document = treeClass.get('adapter', lambda x: x)(document)
    try:
        output = treewalkers.pprint(treeClass['walker'](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        diff = ''.join(unified_diff([line + '\n' for line in expected.splitlines()],
                                    [line + '\n' for line in output.splitlines()],
                                    'Expected', 'Received'))
        assert expected == output, '\n'.join([
            '', 'Input:', input,
            '', 'Expected:', expected,
            '', 'Received:', output,
            '', 'Diff:', diff,
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    warnings.resetwarnings()
    warnings.simplefilter("error")
    try:
        p = html5parser.HTMLParser(tree=treeClass["builder"])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        # Ignore testcases we know we don't pass
        return

    document = treeClass.get("adapter", lambda x: x)(document)
    try:
        output = convertTokens(treeClass["walker"](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
                                    [line + "\n" for line in output.splitlines()],
                                    "Expected", "Received"))
        assert expected == output, "\n".join([
            "", "Input:", input,
            "", "Expected:", expected,
            "", "Received:", output,
            "", "Diff:", diff,
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...
def sanitize_html(self, stream):
    # parse the fragment through the sanitizing tokenizer and
    # re-serialize each resulting node back to markup
    return ''.join([
        token.toxml() for token in html5parser.HTMLParser(
            tokenizer=sanitizer.HTMLSanitizer).parseFragment(
                stream).childNodes
    ])
def scrub(feed_uri, data):
    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0:
            tag = 'language'
        if data.feed.has_key(tag):
            del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag):
                del entry[tag]
            if entry.has_key(tag + "_detail"):
                del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"):
                del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'):
                    continue
                for detail in entry[key].copy():
                    if detail == tag:
                        del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
           data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
               entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                   source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        now = time.time()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        data.entries = [entry for entry in data.entries if
                        (not entry.has_key('published_parsed') or
                         not entry['published_parsed'] or
                         entry['published_parsed'] <= now) and
                        (not entry.has_key('updated_parsed') or
                         not entry['updated_parsed'] or
                         entry['updated_parsed'] <= now)]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'):
                continue
            if not 'html' in node['type']:
                continue
            if not node.has_key('value'):
                continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                           entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            if node['value']:
                # Run this through HTML5's sanitizer
                doc = None
                if 'xhtml' in node['type']:
                    try:
                        from xml.dom import minidom
                        doc = minidom.parseString(node['value'])
                    except:
                        node['type'] = 'text/html'

                if not doc:
                    from html5lib import html5parser, treebuilders, sanitizer
                    p = html5parser.HTMLParser(
                        tree=treebuilders.getTreeBuilder('dom'),
                        tokenizer=sanitizer.HTMLSanitizer)
                    doc = p.parseFragment(node['value'], encoding='utf-8')

                from html5lib import treewalkers, serializer
                walker = treewalkers.getTreeWalker('dom')(doc)
                xhtml = serializer.HTMLSerializer(inject_meta_charset=False)
                tree = xhtml.serialize(walker, encoding='utf-8')

                node['value'] = ''.join([str(token) for token in tree])
def test_unicode_file(self):
    parser = html5parser.HTMLParser()
    parser.parse(io.StringIO("a"))
def test_namespace_html_elements_1_etree(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.tag == "html")
def test_namespace_html_elements_0_etree(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=True)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.tag == "{%s}html" % (namespaces["html"],))
def test_namespace_html_elements_1_dom(self):
    parser = html5parser.HTMLParser(tree=self.dom_tree,
                                    namespaceHTMLElements=False)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.childNodes[0].namespaceURI is None)
def test_line_counter(self):
    # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
    parser = html5parser.HTMLParser(tree=self.dom_tree)
    parser.parse("<pre>\nx\n>\n</pre>")
def test_assertDoctypeCloneable(self):
    parser = html5parser.HTMLParser(tree=self.dom_tree)
    doc = parser.parse('<!DOCTYPE HTML>')
    self.assertTrue(doc.cloneNode(True))
def test_namespace_html_elements_0(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=True)
    doc = parser.parse(u"<html></html>")
    self.assert_(doc.childNodes[0].namespace == namespaces[u"html"])
def test_namespace_html_elements_1(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    doc = parser.parse(u"<html></html>")
    self.assert_(doc.childNodes[0].namespace == None)
def parse():
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    encoding = "utf8"

    try:
        f = args[-1]
        # Try opening from the internet
        if f.startswith('http://'):
            try:
                import urllib.request
                import urllib.parse
                import urllib.error
                import cgi
                f = urllib.request.urlopen(f)
                contentType = f.headers.get('content-type')
                if contentType:
                    (mediaType, params) = cgi.parse_header(contentType)
                    encoding = params.get('charset')
            except:
                pass
        elif f == '-':
            f = sys.stdin
            if sys.version_info[0] >= 3:
                encoding = None
        else:
            try:
                # Try opening from file system
                f = open(f, "rb")
            except IOError as e:
                sys.stderr.write("Unable to open file: %s\n" % e)
                sys.exit(1)
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

    p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)

    if opts.fragment:
        parseMethod = p.parseFragment
    else:
        parseMethod = p.parse

    if opts.profile:
        import cProfile
        import pstats
        cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
                        {"run": run,
                         "parseMethod": parseMethod,
                         "f": f,
                         "encoding": encoding,
                         "scripting": opts.scripting},
                        "stats.prof")
        # XXX - We should use a temp file here
        stats = pstats.Stats('stats.prof')
        stats.strip_dirs()
        stats.sort_stats('time')
        stats.print_stats()
    elif opts.time:
        import time
        t0 = time.time()
        document = run(parseMethod, f, encoding, opts.scripting)
        t1 = time.time()
        if document:
            printOutput(p, document, opts)
            t2 = time.time()
            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
        else:
            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
    else:
        document = run(parseMethod, f, encoding, opts.scripting)
        if document:
            printOutput(p, document, opts)