def parse(self, content):
    failed = False
    # Use an etree-based tree builder; passing a raw TreeBuilder class
    # (as the original `tree=etree.TreeBuilder` did) is not a valid
    # html5lib tree argument.
    p = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("etree"))
    try:
        tree = p.parse(content)
    except Exception:
        self.buggyURLs.add(self.currentURL)
        failed = True
        print("BUGGY:", self.currentURL)
    self.visitedURLs.add(self.currentURL)
    if not failed:
        self.updateURLs(tree)
def test_sanitizer(self):
    allowed_attrs = html5lib.sanitizer.HTMLSanitizer.allowed_attributes[:]
    sanitizer.TextSanitizer.allow_token_parsers = (
        attribute_parsers.DataAttributeParser,
    )
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                 tokenizer=sanitizer.TextSanitizer)
    body = '<span data-one="1" data-two="2">some text</span>'
    body = html.clean_html(body, full=False, parser=parser)
    self.assertTrue('data-one="1"' in body)
    self.assertTrue('data-two="2"' in body)
    self.assertEqual(allowed_attrs,
                     html5lib.sanitizer.HTMLSanitizer.allowed_attributes)
def assertValidHTML(self, content, msg=None):
    parser = html5lib.HTMLParser()
    # Parse the `content` argument (the original parsed self.panel.content,
    # which contradicted the function signature and the lines below).
    parser.parseFragment(content)
    if parser.errors:
        default_msg = ['Content is invalid HTML:']
        lines = content.split('\n')
        for position, errorcode, datavars in parser.errors:
            default_msg.append('  %s' % (html5lib.constants.E[errorcode] % datavars))
            default_msg.append('    %s' % lines[position[0] - 1])
        msg = self._formatMessage(msg, '\n'.join(default_msg))
        raise self.failureException(msg)
def get_document_for(srcname):
    document = None  # an xml.dom.minidom document
    if is_html(srcname):
        # An HTML file
        with open(srcname, "rb") as f:
            parser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("dom"))
            document = parser.parse(f)
    else:
        # An XML file
        document = xml.dom.minidom.parse(srcname)
    return document
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)
            or re.search('style-guide', path)):
        return ''
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        innerTagCount = 0
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1],
                                'text': ''
                            }
                elif current is not None:
                    innerTagCount += 1
            elif element['type'] == 'Characters' and current is not None:
                # If we already have text, check whether the last character
                # is a < or a (, or the string being added starts with > or ),
                # in which case do not add a space.
                if current['text'] != '':
                    if current['text'][-1] != '<' and \
                            not re.match(r"^[\>\)]", element['data']):
                        current['text'] += ' '
                current['text'] = current['text'] + element['data']
            elif element['type'] == 'EndTag' and current is not None:
                if innerTagCount > 0:
                    innerTagCount -= 1
                else:
                    current['text'] = cgi.escape(current['text'])
                    toc.append(current)
                    current = None
        memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                     toc, 3600)
    return toc
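# For reference, a sketch (not from the original source) of the token shapes
# the tree walker above emits. The exact 'data' layout varies across html5lib
# versions -- older releases yield a list of (name, value) pairs, newer ones
# a dict keyed by (namespace, name):
#
#     {'type': 'StartTag', 'name': 'h2', 'data': [('id', 'intro')]}
#     {'type': 'Characters', 'data': 'Introduction'}
#     {'type': 'EndTag', 'name': 'h2'}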
def parse(self, live=False):
    """
    Parse historical urls for a web page over years.
    We first determine the year scale that has valid snapshots.
    @Return: list of historical urls or None
    """
    self._parse_called = True
    wayback_page_whole = self.open_url(self.get_wayback_page(self.url))
    if wayback_page_whole is None:
        return None
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    html_doc = parser.parse(wayback_page_whole)
    position_div = html_doc.find("./{*}body/{*}div[@id='position']")
    sketchinfo = position_div.find(
        "./{*}div[@id='wbSearch']/{*}div[@id='form']"
        "/{*}div[@id='wbMeta']/{*}p[@class='wbThis']")
    first_url = sketchinfo.getchildren()[-1].attrib['href']
    first_year = self.extract_year(first_url)
    for year in range(first_year, datetime.datetime.now().year + 1):
        # Be polite to the host server
        time.sleep(random.randint(1, 3))
        # Note: the timestamp in the search url sets the time scale of the
        # query; the wildcard * matches all items in the given year. If only
        # * is given, the results of the latest year are returned. Queries
        # with small month and day numbers (e.g. 0101) returned wrong
        # results, so a mid-year timestamp is used to match more broadly.
        wayback_page_year = "%s/%d0601000000*/%s" % (self.prefix, year, self.url)
        page_year, his_urls = self._parse_wayback_page(wayback_page_year)
        # Exclude duplicated items that don't match the year: by default the
        # results of the latest year are returned for years not yet crawled.
        if page_year is None or page_year != year:
            continue
        module_logger.debug("%s: %d pages found for year %d"
                            % (self.url, len(his_urls), page_year))
        for url in his_urls:
            try:
                page_year = self.extract_year(url)
            except Exception:
                module_logger.error(
                    "Invalid timestamp of wayback url: %s" % url)
                continue
            if year == page_year:
                if live:
                    self.add_item(year, self.convert_live_url(url))
                else:
                    self.add_item(year, url)
    return self.results
def get_items(location, encoding=None):
    """
    Pass in a string or file-like object and get a list of Items
    present in the HTML document.
    """
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder)
    if encoding:
        tree = parser.parse(location, transport_encoding=encoding)
    else:
        tree = parser.parse(location)
    return _find_items(tree)
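# Minimal usage sketch for the get_items variant above. "page.html" is a
# hypothetical local file; the transport_encoding keyword requires a modern
# html5lib (>= 0.999999999):
def _get_items_example():
    with open("page.html", "rb") as f:
        return get_items(f, encoding="utf-8")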
def get_document_for(fn, spec):
    document = None  # an xml.dom.minidom document
    if fn.endswith(".htm") or fn.endswith(".html"):
        # An HTML file
        with open(fn, "r") as f:
            parser = html5lib.HTMLParser(
                tree=html5lib.treebuilders.getTreeBuilder("dom"))
            document = parser.parse(f)
    else:
        # An XML file
        document = xml.dom.minidom.parse(fn)
    return document
def _html5lib_parser():
    """
    html5lib is a pure-python library that conforms to the WHATWG HTML spec
    and is not vulnerable to certain attacks common for XML libraries.
    """
    return html5lib.HTMLParser(
        # build an lxml tree
        html5lib.treebuilders.getTreeBuilder("lxml"),
        # Strip the XHTML namespace from element tags; otherwise lxml yields
        # something like "{http://www.w3.org/1999/xhtml}div" instead of
        # "div", throwing the algorithm off.
        namespaceHTMLElements=False)
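# A short sketch (not from the original module) of what namespaceHTMLElements
# controls: with the default of True, html5lib qualifies every tag with the
# XHTML namespace.
def _namespace_demo():
    import html5lib
    namespaced = html5lib.parse("<p>x</p>")
    plain = html5lib.parse("<p>x</p>", namespaceHTMLElements=False)
    # namespaced[1].tag == "{http://www.w3.org/1999/xhtml}body"
    # plain[1].tag == "body"
    return namespaced, plain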
def parser(self):
    """
    Here I use html5lib to parse the pages retrieved. I am using
    BeautifulSoup as my parser here and I know it is deprecated;
    I will change this soon...

    Content is taken only from <p> tags, so this could be a lot more
    robust. All words are stemmed and stopwords are removed.
    """
    # get stopwords; strip the trailing newline char from each
    parsed_html = {}
    stopwords = [word[:-1] for word in open('stopwords.txt')]
    pstemmer = stemmer.PorterStemmer()
    htmldocs = os.listdir('pages/')  # grab all html docs and parse them
    words_splitter = re.compile(r'\W*')  # split on non-words
    for htmldoc in htmldocs:
        f = open('pages/' + htmldoc, 'r')
        link = f.readline()
        html = f.readlines()
        try:
            print htmldoc
            p = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder('beautifulsoup'))
            tree = p.parse(html)
        except Exception:
            os.remove(os.path.join('pages', htmldoc))
            print 'error parsing %s' % htmldoc
            continue
        title = tree.findAll('title')
        if title:
            title = title[0].text
        else:
            title = ''
        # grab text from p tags
        data = [p.text.lower() for p in tree.findAll('p')]
        # remove stopwords
        unstemmed_words = [
            word for word in words_splitter.split(''.join(data))
            if word != '' and word not in stopwords
        ]
        stemmed_words = [
            pstemmer.stem(word, 0, len(word) - 1)
            for word in unstemmed_words
        ]
        parsed_html[(title, int(htmldoc), link)] = stemmed_words
    return parsed_html
def test(request):
    if request.method == "POST":
        text = request.POST['text']
        html5parser = html5lib.HTMLParser(strict=True)
        try:
            ans_text = html5parser.parse(text)
        except html5lib.html5parser.ParseError:
            # strict=True makes the parser raise ParseError on the first
            # error instead of recovering; catch that rather than everything.
            ans_text = False
        if ans_text:
            return HttpResponse(text)
        else:
            return HttpResponse("Error")
    return render(request, 'index.html')
def _selectors(html):
    htmlfile = StringIO.StringIO(html)
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse(htmlfile, encoding='utf8')
    nodes = _child_elements(doc)
    result = set()
    while nodes:
        node = nodes.pop()
        result.add(node.nodeName)
        for attr in node.attributes.keys():
            result.add("%s[%s]" % (node.nodeName, attr))
        nodes.extend(_child_elements(node))
    return result
def sanitize_html(html):
    """Sanitizes an HTML fragment, stripping forbidden markup."""
    p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return ''.join(output_generator)
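# Hedged usage note for sanitize_html above: with the legacy html5lib
# HTMLSanitizer tokenizer (removed in html5lib 1.0 in favour of the
# sanitizer filter), disallowed markup is escaped rather than stripped,
# e.g. (indicative output only):
#
#     sanitize_html('<b>hi</b><script>alert(1)</script>')
#     # -> '<b>hi</b>&lt;script&gt;alert(1)&lt;/script&gt;'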
def parse_html_string(self, html_str):
    """Parse the given HTML string to an XML DOM tree.

    Args:
        html_str: string. The HTML document to be parsed.

    Returns:
        An ElementTree representation of the DOM.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('etree', cElementTree),
        namespaceHTMLElements=False)
    return parser.parse(html_str)
def test_html5_validation(self):
    response = self.client.get("/regular/HTML5/")
    parser = html5lib.HTMLParser()
    content = response.content
    parser.parse(content)
    if parser.errors:
        default_msg = ["Content is invalid HTML:"]
        lines = content.split(b"\n")
        for position, errorcode, datavars in parser.errors:
            default_msg.append("  %s" % (html5lib.constants.E[errorcode] % datavars))
            default_msg.append("    %r" % lines[position[0] - 1])
        msg = self._formatMessage(None, "\n".join(default_msg))
        raise self.failureException(msg)
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the "
                      "html5lib tree builder doesn't support parse_only. "
                      "The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def getSoup(spec_url, hset, domain):
    if domain not in spec_url:
        url = domain + spec_url
    else:
        url = spec_url
    data = URLGRABBER.urlread(url)
    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(data)
    #soup = Soup.BeautifulSoup(data)
    return soup
def get_components_using_html5lib(html):
    """Find lesson components using the pure python html5lib library."""
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('etree', cElementTree),
        namespaceHTMLElements=False)
    content = parser.parseFragment('<div>%s</div>' % html)[0]
    components = []
    for component in content.findall('.//*[@instanceid]'):
        component_dict = {'cpt_name': component.tag}
        component_dict.update(component.attrib)
        components.append(component_dict)
    return components
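# Illustrative call for get_components_using_html5lib; the tag name and
# instanceid value below are invented for the example:
#
#     get_components_using_html5lib('<gcb-youtube instanceid="a1"></gcb-youtube>')
#     # -> [{'cpt_name': 'gcb-youtube', 'instanceid': 'a1'}]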
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    parser = html5lib.HTMLParser()
    dom = parser.parseFragment(
        '<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    walker = html5lib.getTreeWalker('etree')
    ser = HTMLSerializer(alphabetical_attributes=True,
                         quote_attr_values='always')

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer.
    # When that gets fixed, we can fix this expected result.
    assert (
        ser.render(walker(dom)) ==
        '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    )
def get_xpaths(assembly_id):
    url = "http://ko.wikipedia.org/wiki/대한민국_제%s대_국회의원_목록" % assembly_id
    p = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    r = urllib2.Request(url)
    r.add_header("User-Agent", settings["USER_AGENT"])
    f = urllib2.urlopen(r)
    page = p.parse(f)
    xpaths = page.xpath(settings["X_PATH"])
    return xpaths
def embed(rawhtml, outfile, rootdirs=(serve.cwd, )):
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))
    dom = parser.parse(rawhtml)
    head = dom.getElementsByTagName("head")[0]
    wurl = dom.createElement("script")
    wurl.setAttribute("type", "text/javascript")
    wurl.appendChild(dom.createTextNode('''
        if (window.webkitURL) window.URL = window.webkitURL;
    '''))
    head.insertBefore(wurl, head.childNodes[0])

    for script in dom.getElementsByTagName("script"):
        stype = script.getAttribute("type")
        if stype == "text/html":
            sdom = parser.parse(script.childNodes[0].wholeText)
            _embed_images(sdom, rootdirs)
            while len(script.childNodes) > 0:
                for c in script.childNodes:
                    script.removeChild(c)
            shtml = ""
            for el in sdom.getElementsByTagName("body")[0].childNodes:
                shtml += el.toxml()
            script.appendChild(dom.createTextNode(shtml))
        elif stype == "text/javascript":
            src = script.getAttribute("src")
            if len(src) > 0:
                _embed_js(dom, script, rootdirs)

    for css in dom.getElementsByTagName("link"):
        if css.getAttribute("type") == "text/css":
            csstext = _embed_css(
                _resolve_path(css.getAttribute("href"), rootdirs), rootdirs)
            ncss = dom.createElement("style")
            ncss.setAttribute("type", "text/css")
            ncss.appendChild(dom.createTextNode(csstext))
            css.parentNode.insertBefore(ncss, css)
            css.parentNode.removeChild(css)

    _embed_images(dom, rootdirs)

    # Save out the new html file
    with open(outfile, "w") as htmlfile:
        serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
        walker = html5lib.treewalkers.getTreeWalker("dom")
        for line in serializer.serialize(walker(dom)):
            htmlfile.write(line.encode("utf-8"))
def parse_html(content):
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)
        if not document:
            # Could not parse
            return content

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")
        for file in get_files("htmlScreenshot", "js"):
            SubElement(head, "script", attrib={"src": file['url']})

        # Currently, html5lib strips the doctype, but it's important for
        # correct rendering, so check the original content for the doctype
        # and, if found, prepend it to the content serialized by html5lib.
        doctype = None
        try:
            # Parse the content as a dom tree instead, so that we capture
            # any doctype node as a dom node that we can read.
            tree_builder_dom = html5lib.treebuilders.getTreeBuilder("dom")
            parser_dom = html5lib.HTMLParser(tree_builder_dom,
                                             namespaceHTMLElements=False)
            tree = parser_dom.parse(content)

            # By the HTML spec, if a doctype is included it must be the
            # first thing in the document, so it has to be the first child
            # node of the document.
            doctype_node = tree.childNodes[0]

            # Check that this node is in fact a doctype node
            if doctype_node.nodeType == doctype_node.DOCUMENT_TYPE_NODE:
                # Render it to a string by calling the toxml method; toxml
                # uses single quotes by default, replace them with "".
                doctype = doctype_node.toxml().replace("'", '"')
        except Exception as e:
            logging.warning(
                "Error in HTML5 parsing to determine doctype {}".format(e))

        html = html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )

        if doctype:
            html = doctype + html

        return html
    except html5lib.html5parser.ParseError:
        return content
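# Indicative round trip for parse_html above, assuming the surrounding
# module's get_files helper returns no scripts for this input (output is
# illustrative, not verified against a specific html5lib release):
#
#     parse_html("<!DOCTYPE html><p>hi")
#     # -> '<!DOCTYPE html><html><head></head><body><p>hi</p></body></html>'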
def get_items(location, encoding='UTF-8'):
    """
    Pass in a file or file-like object and get a list of Items
    present in the HTML document.
    """
    dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder)
    if sys.version_info.major == 3:
        tree = parser.parse(location)
    else:
        tree = parser.parse(location, encoding=encoding)
    return _find_items(tree)
def validate_content(testcase, data, page_descr="unknown page"):
    """
    Validate data as HTML5.

    testcase should be a unittest.TestCase object (or similar).
    page_descr should be a human-readable description of the page
    being tested.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))
    parser.parse(data)
    if parser.errors:
        with open("tmp-validation.html", "wb") as fh:
            fh.write(data)
        testcase.fail("Invalid HTML5 produced in %s:\n  %s"
                      % (page_descr, str(parser.errors)))
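# parser.errors entries are ((line, col), errorcode, datavars) tuples; for
# friendlier failure messages than str(parser.errors), the codes can be
# expanded through html5lib.constants.E, as other snippets here do:
#
#     for (line, col), code, data in parser.errors:
#         print(html5lib.constants.E[code] % data)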
def read_xml(filename, mangle_entities=False):
    """
    Read in a document, returning the ElementTree doc node.
    """
    tree = treebuilders.getTreeBuilder('lxml')
    parser = html5lib.HTMLParser(strict=False, tree=tree)
    doc = html5parser.parse(filename, parser=parser)
    if parser.errors:
        sys.stderr.write('errors in {0}\n'.format(filename))
        for e in parser.errors:
            sys.stderr.write('  {0}\n'.format(e))
    return doc
def sanitize_html_string(content):
    """Sanitizes the given html string.

    Raises:
        forms.ValidationError in case of an error.
    """
    try:
        parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
        parsed = parser.parseFragment(content, encoding='utf-8')
        cleaned_content = ''.join([tag.toxml() for tag in parsed.childNodes])
    except (HTMLParser.HTMLParseError, html5parser.ParseError) as msg:
        raise forms.ValidationError(msg)
    return cleaned_content
def clean(self, value):
    chars = super(HTMLField, self).clean(value)
    # chars = chars.encode('utf-8')
    # should really find out where we have decoded input to unicode
    # and do it there instead
    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    # could use Beautiful Soup here instead
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    # encoding="utf-8" - unicode input seems to work fine
    dom_tree = p.parseFragment(chars)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    return ''.join(s.serialize(stream))
def get_options(template):
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    doc = parser.parse(template)
    options = {}
    media_size = doc.documentElement.attributes.get('data-gbclient-media-size')
    options['media'] = media_size.value if media_size else '62mm'
    media_orientation = doc.documentElement.attributes.get(
        'data-gbclient-orientation')
    if media_orientation:
        options['orientation-requested'] = media_orientation.value
    return options
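# Hypothetical input for get_options above; the data-gbclient-* attributes
# on the root <html> element drive the returned print options:
#
#     get_options('<html data-gbclient-media-size="100mm"'
#                 ' data-gbclient-orientation="landscape"></html>')
#     # -> {'media': '100mm', 'orientation-requested': 'landscape'}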
def prepare_template_stream(stream, base_url):
    """Prepares the stream to be stored in the DB"""
    stream = force_unicode(stream) if stream else u''
    tree = treebuilders.getTreeBuilder('lxml')
    parser = html5lib.HTMLParser(tree=tree, namespaceHTMLElements=False)
    document_tree = parser.parse(stream)
    script_elements = document_tree.xpath('//script[@src]')
    for script in script_elements:
        src = script.get('src')
        butter_library = get_butter_library(src)
        if butter_library:
            script.set('src', butter_library)
    make_links_absolute(document_tree, base_url)
    return _serialize_stream(document_tree)
def test_th_has_no_css_rules():
    html = HTML_CONTENT
    result = BytesIO()
    pdf = pisaDocument(BytesIO(html.encode('utf-8')), result)
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom"))
    document = parser.parse(html)
    th_element = document.getElementsByTagName("th")[0]
    th_element = CSSDOMElementInterface(th_element)
    attr_name = "background-color"
    rules = pdf.cssCascade.findCSSRulesFor(th_element, attr_name)
    tools.assert_list_equal(rules, [])