def save_doc(self, dselector, fname):
    # really should refactor to use %20 etc escapes, etc
    from html5lib import serialize
    s = u'<!DOCTYPE html>' + \
        self._build_notice() + \
        '%s\n%s' % (serialize(self.d('head')[0], tree="lxml"),
                    serialize(self.d('body')[0], tree="lxml")) + \
        '\n</html>\n'
    self.write(s, fname)
def save_doc(self, dselector, fname):
    # really should refactor to use %20 etc escapes, etc
    from html5lib import serialize
    s = u'<!DOCTYPE html>' + \
        ie_shim + \
        self.notice % (map_file, self.map_line, dt()) + \
        '%s\n%s' % (serialize(self.d('head')[0], tree="lxml"),
                    serialize(self.d('body')[0], tree="lxml")) + \
        '\n</html>\n'
    self.write(s, fname)
def save_doc(self, dselector, fname):
    # really should refactor to use %20 etc escapes, etc
    from html5lib import serialize
    #s = u'<!DOCTYPE html>' + \
    #    ie_shim + \
    s = config.preamble + \
        self._build_notice() + \
        '%s\n%s' % (serialize(self.d('head')[0], tree="lxml"),
                    serialize(self.d('body')[0], tree="lxml")) + \
        '\n</html>\n'
    self.write(s, fname)
def table_of_contents(html, url='', anchor_type='stacked-number'):
    index = [0, 0, 0, 0, 0, 0]
    depth = 0
    toc_doc = getDOMImplementation().createDocument(None, 'ol', None)
    toc = ol = toc_doc.documentElement
    doc = html5lib.parse(html, treebuilder='dom', namespaceHTMLElements=False)
    for header in traverse_headings(doc.documentElement):
        nextdepth = int(header.nodeName[1])
        if nextdepth > depth:
            for i in range(nextdepth, 6):
                index[i - 1] = 0
            for i in range(depth, nextdepth):
                next_ol = toc_doc.createElement('ol')
                ol.appendChild(next_ol)
                ol = next_ol
        elif nextdepth < depth:
            for i in range(nextdepth, depth):
                ol = ol.parentNode
        depth = nextdepth
        index[depth - 1] += 1
        label = '-'.join([str(index[d]) for d in range(0, depth) if index[d]])
        li = toc_doc.createElement('li')
        a = toc_doc.createElement('a')
        a.setAttribute('href', '%s#header-%s' % (url, label))
        a.appendChild(doc.createTextNode(innerText(header)))
        li.appendChild(a)
        ol.appendChild(li)
        header.setAttribute('id', 'header-' + label)
        if anchor_type == 'following-marker':
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode('#'))
            header.appendChild(anchor)
        else:
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode(label))
            header.insertBefore(anchor, header.firstChild)
    ol = toc
    while (not list(filter(lambda node: node.nodeName == 'li', ol.childNodes)) and
           list(filter(lambda node: node.nodeName == 'ol', ol.childNodes))):
        ol = list(filter(lambda node: node.nodeName == 'ol', ol.childNodes))[0]
    ol.setAttribute('class', 'toc')
    return html5lib.serialize(ol, 'dom'), html5lib.serialize(doc, 'dom')
def table_of_contents(html, url='', anchor_type='stacked-number'):
    index = [0, 0, 0, 0, 0, 0]
    depth = 0
    toc_doc = getDOMImplementation().createDocument(None, 'ol', None)
    toc = ol = toc_doc.documentElement
    doc = html5lib.parse(html, treebuilder='dom', namespaceHTMLElements=False)
    for header in traverse_headings(doc.documentElement):
        nextdepth = int(header.nodeName[1])
        if nextdepth > depth:
            for i in range(nextdepth, 6):
                index[i - 1] = 0
            for i in range(depth, nextdepth):
                next_ol = toc_doc.createElement('ol')
                ol.appendChild(next_ol)
                ol = next_ol
        elif nextdepth < depth:
            for i in range(nextdepth, depth):
                ol = ol.parentNode
        depth = nextdepth
        index[depth - 1] += 1
        label = '-'.join([str(index[d]) for d in range(0, depth) if index[d]])
        li = toc_doc.createElement('li')
        a = toc_doc.createElement('a')
        a.setAttribute('href', '%s#header-%s' % (url, label))
        a.appendChild(doc.createTextNode(innerText(header)))
        li.appendChild(a)
        ol.appendChild(li)
        header.setAttribute('id', 'header-' + label)
        if anchor_type == 'following-marker':
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode('#'))
            header.appendChild(anchor)
        else:
            anchor = toc_doc.createElement('a')
            anchor.setAttribute('href', '#header-%s' % label)
            anchor.setAttribute('class', 'toc-anchor')
            anchor.appendChild(doc.createTextNode(label))
            header.insertBefore(anchor, header.firstChild)
    ol = toc
    while (not list(filter(lambda node: node.nodeName == 'li', ol.childNodes)) and
           list(filter(lambda node: node.nodeName == 'ol', ol.childNodes))):
        ol = list(filter(lambda node: node.nodeName == 'ol', ol.childNodes))[0]
    ol.setAttribute('class', 'toc')
    return (html5lib.serialize(ol, 'dom', quote_attr_values=True),
            html5lib.serialize(doc, 'dom', quote_attr_values=True))
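# A minimal usage sketch for the function above (hypothetical input HTML; assumes the
# traverse_headings/innerText helpers it relies on are importable alongside it):
toc_html, body_html = table_of_contents(
    '<h1>Intro</h1><p>Some text.</p><h2>Background</h2>', url='guide.html')
# toc_html holds the nested <ol class="toc"> markup; body_html is the same document
# re-serialized with id attributes and anchor links added to each heading.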
def write(self, outfile):
    output = ""
    if self.xpath:
        p = self.tree.xpath(self.xpath)
        if p:
            output = "\n".join(html5lib.serialize(elt, tree="lxml", encoding="utf-8")
                               for elt in p)
    else:
        output = html5lib.serialize(self.tree, tree="lxml", encoding="utf-8")
    open(outfile, 'wb').write(output)
def bench_serialize(loops, fh, treebuilder):
    fh.seek(0)
    doc = html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)

    range_it = range(loops)
    t0 = pyperf.perf_counter()
    for loops in range_it:
        html5lib.serialize(doc, tree=treebuilder, encoding="ascii",
                           inject_meta_charset=False)

    return pyperf.perf_counter() - t0
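# A minimal sketch of how the benchmark above could be driven (hypothetical file name;
# assumes html5lib and pyperf are imported at module level as in the snippet, and uses
# pyperf's Runner.bench_time_func, which passes the loop count as the first argument):
if __name__ == "__main__":
    runner = pyperf.Runner()
    with open("document.html", "rb") as fh:
        runner.bench_time_func("html5lib_serialize", bench_serialize, fh, "etree")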
def parse_html(content):
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)

        if not document:
            # Could not parse
            return content

        for parent in document.findall(".//script/.."):
            for script in parent.findall("script"):
                replace_script(parent, script)

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")

        SubElement(
            head,
            "script",
            attrib={
                "src": static(
                    "content/{filename}".format(filename=get_hashi_filename())
                )
            },
        )

        return html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )
    except html5lib.html5parser.ParseError:
        return content
def parse(self, response):
    correct_html = html5lib.serialize(html5lib.parse(response.body))
    selector = Selector(text=correct_html)
    page = UkrNetPage()
    links = selector.xpath("//a/@href").extract()
    images = selector.xpath("//img/@src").extract()

    corrected_images = []
    for img in images:
        correct_img = img
        if img.startswith('//'):
            correct_img = img.split('//')[1]
        corrected_images.append(correct_img)

    text = filter(self.is_str_empty,
                  map(lambda s: s.strip(),
                      [text.extract() for text in
                       response.xpath("//*[not(self::script)]/text()")]))

    page['url'] = response.request.url
    page['fragment_img'] = corrected_images
    page['fragment_text'] = text

    ukr_net_url = "www.ukr.net"
    wrong_start = "www."
    new_links_list = []
    for link in links:
        if ukr_net_url in link:
            correct_url = link
            if link.startswith("//"):
                correct_url = link.split('//')[1]
            if correct_url.startswith(wrong_start):
                correct_url = "https://" + correct_url
            new_links_list.append(correct_url)

    yield page

    links = new_links_list[:20]
    for link in links:
        yield response.follow(link, callback=self.parse)
def balance(html, limit_words=None, ellipsis=' ...'):
    if not limit_words:
        return html5lib.serialize(html5lib.parse(html))
    word_count = 0
    tokens = tag_re.split(html)
    out = []
    stack = []
    for token in tokens:
        if not token.startswith('<'):
            if limit_words and word_count > limit_words:
                break
            words = token.split()
            word_count += len(words)
            if limit_words and word_count > limit_words:
                crop = limit_words - word_count
                out.append(' '.join(words[:crop]) + ellipsis)
            else:
                out.append(token)
            continue
        if token.startswith('<!'):
            continue
        if token == ']]>':
            continue
        if not token.endswith('>'):
            continue  # invalid
        element = token[1:-1].split()[0].lower()
        if not element:
            continue  # invalid
        if element in banned:
            element = 'pre'
            token = '<pre>'
        if element.startswith('/'):
            element = element[1:]
            if element in banned:
                element = 'pre'
                token = '</pre>'
            if element in stack:
                top = None
                while stack and top != element:
                    top = stack.pop()
                    out.append('</%s>' % top)
                continue
            else:
                continue
        if element in block and stack and stack[-1] not in block:
            # close previous block if any
            for i in xrange(len(stack) - 1, -1, -1):
                if stack[i] in block:
                    break
            stack, previous_block = stack[:i], stack[i:]
            previous_block.reverse()
            for tag in previous_block:
                out.append('</%s>' % tag)
        if element in closing and not token.endswith('/>'):
            stack.append(element)
        out.append(token)
    # flush the stack
    out.extend(['</%s>' % element for element in reversed(stack)])
    return ''.join(out)
def parse_html(content):
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)

        if not document:
            # Could not parse
            return content

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")

        # Use the makeelement method of the head tag here to ensure that we use the same
        # Element class for both. Depending on the system and python version we are on,
        # we may be using the C implementation or the pure python and a mismatch will cause an error.
        script_tag = head.makeelement("script", {"type": "text/javascript"})

        script_tag.text = INITIALIZE_HASHI_FROM_IFRAME

        head.insert(0, script_tag)

        # Currently, html5lib strips the doctype, but it's important for correct rendering,
        # so check the original content for the doctype and, if found, prepend it to the
        # content serialized by html5lib
        doctype = None
        try:
            # Now parse the content as a dom tree instead, so that we capture
            # any doctype node as a dom node that we can read.
            tree_builder_dom = html5lib.treebuilders.getTreeBuilder("dom")
            parser_dom = html5lib.HTMLParser(tree_builder_dom, namespaceHTMLElements=False)
            tree = parser_dom.parse(content)

            # By HTML Spec if doctype is included, it must be the first thing
            # in the document, so it has to be the first child node of the document
            doctype_node = tree.childNodes[0]

            # Check that this node is in fact a doctype node
            if doctype_node.nodeType == doctype_node.DOCUMENT_TYPE_NODE:
                # render to a string by calling the toxml method
                # toxml uses single quotes by default, replace with ""
                doctype = doctype_node.toxml().replace("'", '"')
        except Exception as e:
            logger.warn(
                "Error in HTML5 parsing to determine doctype {}".format(e))

        html = html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )

        if doctype:
            html = doctype + html

        return html
    except html5lib.html5parser.ParseError:
        return content
def balance(html, limit_words=None, ellipsis=' ...'):
    if not limit_words:
        return html5lib.serialize(html5lib.parse(html))
    word_count = 0
    tokens = tag_re.split(html)
    out = []
    stack = []
    for token in tokens:
        if not token.startswith('<'):
            if limit_words and word_count > limit_words:
                break
            words = token.split()
            word_count += len(words)
            if limit_words and word_count > limit_words:
                crop = limit_words - word_count
                out.append(' '.join(words[:crop]) + ellipsis)
            else:
                out.append(token)
            continue
        if token.startswith('<!'):
            continue
        if token == ']]>':
            continue
        if not token.endswith('>'):
            continue  # invalid
        element = token[1:-1].split()[0].lower()
        if not element:
            continue  # invalid
        if element in banned:
            element = 'pre'
            token = '<pre>'
        if element.startswith('/'):
            element = element[1:]
            if element in banned:
                element = 'pre'
                token = '</pre>'
            if element in stack:
                top = None
                while stack and top != element:
                    top = stack.pop()
                    out.append('</%s>' % top)
                continue
            else:
                continue
        if element in block and stack and stack[-1] not in block:
            # close previous block if any
            for i in range(len(stack) - 1, -1, -1):
                if stack[i] in block:
                    break
            stack, previous_block = stack[:i], stack[i:]
            previous_block.reverse()
            for tag in previous_block:
                out.append('</%s>' % tag)
        if element in closing and not token.endswith('/>'):
            stack.append(element)
        out.append(token)
    # flush the stack
    out.extend(['</%s>' % element for element in reversed(stack)])
    return ''.join(out)
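# Quick illustrative call (hypothetical input), assuming the module-level tag_re, banned,
# block and closing tables that balance() consults include the usual p/b entries:
print(balance('<p>one two <b>three four five</b></p>', limit_words=3))
# With the default ellipsis this would come out roughly as
# '<p>one two <b>three ...</b></p>' -- the truncated markup is re-balanced before returning.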
def test_sanitizer(expected, input):
    parsed = parseFragment(expected)
    expected = serialize(parsed,
                         omit_optional_tags=False,
                         use_trailing_solidus=True,
                         space_before_trailing_solidus=False,
                         quote_attr_values="always",
                         quote_char='"',
                         alphabetical_attributes=True)
    assert expected == sanitize_html(input)
def runSanitizerTest(_, expected, input):
    parsed = parseFragment(expected)
    expected = serialize(parsed,
                         omit_optional_tags=False,
                         use_trailing_solidus=True,
                         space_before_trailing_solidus=False,
                         quote_attr_values="always",
                         quote_char='"',
                         alphabetical_attributes=True)
    assert expected == sanitize_html(input)
def sanitize(string):
    """
    Ensure that the text does not contain any malicious HTML code which
    might break the page.
    """
    from html5lib import parseFragment, serialize

    parsed = parseFragment(string)
    clean = serialize(parsed, sanitize=True, omit_optional_tags=False,
                      quote_attr_values='always')
    return clean
def sanitize_html(stream):
    parsed = parseFragment(stream)
    serialized = serialize(parsed,
                           sanitize=True,
                           omit_optional_tags=False,
                           use_trailing_solidus=True,
                           space_before_trailing_solidus=False,
                           quote_attr_values="always",
                           quote_char='"',
                           alphabetical_attributes=True)
    return serialized
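# Example call for the sanitizing serializer above (hypothetical input): disallowed
# markup such as <script> is escaped rather than executed, and attribute values come
# back double-quoted and alphabetically ordered.
print(sanitize_html('<a title=hi href="http://example.com">link</a><script>alert(1)</script>'))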
def _html_serialize(self, chunks, attributes, max_length):
    """Returns concatenated HTML code with SPAN tag.

    Args:
      chunks: The list of chunks to be processed. (ChunkList)
      attributes: If a dictionary, it should be a map of name-value pairs for
          attributes of output SPAN tags. If a string, it should be a class name
          of output SPAN tags. If an array, it should be a list of class names
          of output SPAN tags. (str or dict or list of str)
      max_length: Maximum length of span enclosed chunk. (int, optional)

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in chunks:
        if chunk.is_space():
            if doc.getchildren():
                if doc.getchildren()[-1].tail is None:
                    doc.getchildren()[-1].tail = ' '
                else:
                    doc.getchildren()[-1].tail += ' '
            else:
                if doc.text is not None:
                    # We want to preserve space in cases like "Hello 你好"
                    # But the space in " 你好" can be discarded.
                    doc.text += ' '
        else:
            if chunk.has_cjk() and not (max_length and len(chunk.word) > max_length):
                ele = ET.Element('span')
                ele.text = chunk.word
                for k, v in attributes.items():
                    ele.attrib[k] = v
                doc.append(ele)
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element
                if doc.getchildren():
                    if doc.getchildren()[-1].tail is None:
                        doc.getchildren()[-1].tail = chunk.word
                    else:
                        doc.getchildren()[-1].tail += chunk.word
                else:
                    if doc.text is None:
                        doc.text = chunk.word
                    else:
                        doc.text += chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True, quote_attr_values="always")
    return result
def parse_html(content):
    try:
        document = html5lib.parse(content, namespaceHTMLElements=False)

        if not document:
            # Could not parse
            return content

        # Because html5lib parses like a browser, it will
        # always create head and body tags if they are missing.
        head = document.find("head")

        for file in get_files("htmlScreenshot", "js"):
            SubElement(head, "script", attrib={"src": file['url']})

        # Currently, html5lib strips the doctype, but it's important for correct rendering,
        # so check the original content for the doctype and, if found, prepend it to the
        # content serialized by html5lib
        doctype = None
        try:
            # Now parse the content as a dom tree instead, so that we capture
            # any doctype node as a dom node that we can read.
            tree_builder_dom = html5lib.treebuilders.getTreeBuilder("dom")
            parser_dom = html5lib.HTMLParser(tree_builder_dom, namespaceHTMLElements=False)
            tree = parser_dom.parse(content)

            # By HTML Spec if doctype is included, it must be the first thing
            # in the document, so it has to be the first child node of the document
            doctype_node = tree.childNodes[0]

            # Check that this node is in fact a doctype node
            if doctype_node.nodeType == doctype_node.DOCUMENT_TYPE_NODE:
                # render to a string by calling the toxml method
                # toxml uses single quotes by default, replace with ""
                doctype = doctype_node.toxml().replace("'", '"')
        except Exception as e:
            logging.warn(
                "Error in HTML5 parsing to determine doctype {}".format(e))

        html = html5lib.serialize(
            document,
            quote_attr_values="always",
            omit_optional_tags=False,
            minimize_boolean_attributes=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
        )

        if doctype:
            html = doctype + html

        return html
    except html5lib.html5parser.ParseError:
        return content
def save_template_once(self, dselector, fname):
    "Save partial template for later editing; don't overwrite!"
    from html5lib import serialize
    ext = fname.lower().split('.')[-1]
    #s = dselector.outerHtml()
    s = serialize(dselector, tree="lxml")
    if ext == 'minaml':
        s = convert_to_shpaml(s)
    elif ext == 'html':
        pass  # s = s
    else:
        print 'NYI'  # TODO: compiled html template, yaml for object / declaration
    if trace:
        print 'SAVING ONCE:', fname
    self.write(s, fname, overwrite=False)
def save_template_once(self, dselector, fname):
    "Save partial template for later editing; don't overwrite!"
    from html5lib import serialize
    ext = fname.lower().split('.')[-1]
    #s = dselector.outerHtml()
    s = serialize(dselector, tree="lxml")
    if ext == 'minaml':
        s = convert_to_shpaml(s)
    elif ext == 'html':
        pass  # s = s
    else:
        print 'NYI'  # TODO: compiled html template, yaml for object / declaration
    if trace:
        print 'SAVING ONCE:', fname
    self.write(s, fname, overwrite=False)
def markdown(value):
    # Renders the string using CommonMark in safe mode, which blocks
    # raw HTML in the input and also some links using a blacklist,
    # plus a second pass filtering using a whitelist for allowed
    # tags and URL schemes.
    import CommonMark
    ast = CommonMark.Parser().parse(force_unicode(value))
    html = CommonMark.HtmlRenderer({'safe': True}).render(ast)

    import html5lib, urlparse

    def filter_url(url):
        try:
            urlp = urlparse.urlparse(url)
        except Exception as e:
            # invalid URL
            return None
        if urlp.scheme not in ("http", "https"):
            return None
        return url

    valid_tags = set(
        'strong em a code p h1 h2 h3 h4 h5 h6 pre br hr img ul ol li span blockquote'
        .split())
    valid_tags = set('{http://www.w3.org/1999/xhtml}' + tag for tag in valid_tags)
    dom = html5lib.HTMLParser().parseFragment(html)
    for node in dom.iter():
        if node.tag not in valid_tags and node.tag != 'DOCUMENT_FRAGMENT':
            node.tag = '{http://www.w3.org/1999/xhtml}span'
        for name, val in node.attrib.items():
            if name.lower() in ("href", "src"):
                val = filter_url(val)
                if val is None:
                    node.attrib.pop(name)
                else:
                    node.set(name, val)
            else:
                # No other attributes are permitted.
                node.attrib.pop(name)
    html = html5lib.serialize(dom, quote_attr_values="always",
                              omit_optional_tags=False,
                              alphabetical_attributes=True)
    return safestring.mark_safe(html)
def runtest(self):
    input = self.test["input"]
    expected = self.test["output"]

    parsed = parseFragment(input)
    serialized = serialize(parsed,
                           sanitize=True,
                           omit_optional_tags=False,
                           use_trailing_solidus=True,
                           space_before_trailing_solidus=False,
                           quote_attr_values="always",
                           quote_char="'",
                           alphabetical_attributes=True)
    errorMsg = "\n".join(["\n\nInput:", input,
                          "\nExpected:", expected,
                          "\nReceived:", serialized])
    assert expected == serialized, errorMsg
def build_page(filepath: str) -> None:
    with open(filepath, "rb") as f:
        document: et.Element = html5lib.parse(f)

    name, _ = os.path.splitext(os.path.split(filepath)[1])
    page = Page.objects.get(name=name)
    page.order = int(select('meta[name=sfs_order]', document).get_attr('content') or 0)
    page.active = parse_bool_attr_value(select('meta[name=sfs_active]', document).get_attr('content'))
    page.icon = select('meta[name=sfs_icon]', document).get_attr('content') or ''
    page.title = select('title', document).text

    parent_name = select('meta[name=sfs_parent]', document).get_attr('content')
    if parent_name:
        parent_page = Page.objects.get(name=parent_name)
        page.parent = parent_page

    body = select('body', document).get(0)
    page.content = html5lib.serialize(body)
    page.save()
def parse(self, response):
    correct_html = html5lib.serialize(html5lib.parse(response.body))
    selector = Selector(text=correct_html)
    products = selector.xpath(self.selectors['product'])[:20]
    for prod in products:
        item = RepkaItem()
        name = prod.xpath(self.selectors['product-name']).extract()[0]
        url = prod.xpath(self.selectors['product-url']).extract()[0]
        price = prod.xpath(self.selectors['product-price']).extract()[0]
        image = prod.xpath(self.selectors['product-image']).extract()[0]
        descr_titles = prod.xpath(
            self.selectors["product-description-title"]).extract()
        descr_info = prod.xpath(
            self.selectors["product-description-info"]).extract()
        descr_info = list(filter(self.is_str_empty, descr_info))

        item['name'] = name
        item['url'] = url
        item['price'] = price
        item['image'] = image
        item['description_titles'] = descr_titles
        item['description_info'] = descr_info
        yield item
def html_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Args:
      attributes (dict): A map of name-value pairs for attributes of
          output SPAN tags.
      max_length (:obj:`int`, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
        if (chunk.has_cjk() and
                not (max_length and len(chunk.word) > max_length)):
            ele = ET.Element('span')
            ele.text = chunk.word
            for key, val in attributes.items():
                ele.attrib[key] = val
            doc.append(ele)
        else:
            # add word without span tag for non-CJK text (e.g. English)
            # by appending it after the last element
            if doc.getchildren():
                if doc.getchildren()[-1].tail is None:
                    doc.getchildren()[-1].tail = chunk.word
                else:
                    doc.getchildren()[-1].tail += chunk.word
            else:
                if doc.text is None:
                    doc.text = chunk.word
                else:
                    doc.text += chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result
def _inject_session(self, session, html):
    serialized = session.serialize()

    def _walk(node):
        for child in node.childNodes:
            _walk(child)
        if node.name in self.session_url_rewrite_map:
            for attr in self.session_url_rewrite_map[node.name]:
                value = node.attributes.get(attr)
                if value is None:
                    continue
                new_value = self._rewrite_session_url(value, serialized)
                node.attributes[attr] = new_value
        elif node.name == 'form':
            hidden = Element('input')
            hidden.attributes.update(
                type='hidden',
                name=self.session_url_key,
                value=serialized
            )
            node.childNodes.append(hidden)

    tree = _parser.parse(html)
    _walk(tree)
    return serialize(tree)
def render_content(self, content_text: str) -> str:
    document: et.Element = html5lib.parse(content_text)
    body = select('body', document).get(0)

    # Process src attributes
    for element in select('*[data-sfs-src]', body):
        src = element.get_attr('src')
        mode = element.get_attr('data-sfs-src')
        if mode == 'static':
            element.set_attr('src', static(src))
        elif mode == 'file':
            element.set_attr('src', file(src))
        element.del_attr('data-sfs-src')

    # Process anchor hrefs
    for a in select('a[data-sfs-href]', body):
        name = a.get_attr('href')
        a.set_attr('href', reverse('page', kwargs={'name': name}))
        a.del_attr('data-sfs-href')

    return html5lib.serialize(body)
def markdown(value):
    # Renders the string using CommonMark in safe mode, which blocks
    # raw HTML in the input and also some links using a blacklist,
    # plus a second pass filtering using a whitelist for allowed
    # tags and URL schemes.
    import cmarkgfm
    from cmarkgfm.cmark import Options as cmarkgfmOptions
    html = cmarkgfm.github_flavored_markdown_to_html(
        value, options=cmarkgfmOptions.CMARK_OPT_SAFE)

    import html5lib, urllib.parse

    def filter_url(url):
        try:
            urlp = urllib.parse.urlparse(url)
        except Exception as e:
            # invalid URL
            return None
        if urlp.scheme not in ("http", "https"):
            return None
        return url

    valid_tags = set(
        'strong em a code p h1 h2 h3 h4 h5 h6 pre br hr img ul ol li span blockquote'
        .split())
    valid_tags = set('{http://www.w3.org/1999/xhtml}' + tag for tag in valid_tags)
    dom = html5lib.HTMLParser().parseFragment(html)
    for node in dom.iter():
        if node.tag not in valid_tags and node.tag != 'DOCUMENT_FRAGMENT':
            node.tag = '{http://www.w3.org/1999/xhtml}span'
        for name, val in list(node.attrib.items()):
            if name.lower() in ("href", "src"):
                val = filter_url(val)
                if val is None:
                    node.attrib.pop(name)
                else:
                    node.set(name, val)
            else:
                # No other attributes are permitted.
                node.attrib.pop(name)

    # If there is an h1 in the output, demote all of the headings
    # so we don't create something that interferes with the page h1.
    hash1 = False
    for node in dom.iter():
        if node.tag in ("h1", "{http://www.w3.org/1999/xhtml}h1"):
            hash1 = True
    if hash1:
        for node in dom.iter():
            m = re.match(r"(\{http://www.w3.org/1999/xhtml\})?h(\d)$", node.tag)
            if m:
                node.tag = (m.group(1) or "") + "h" + str(int(m.group(2)) + 1)

    html = html5lib.serialize(dom, quote_attr_values="always",
                              omit_optional_tags=False,
                              alphabetical_attributes=True)
    return safestring.mark_safe(html)
def _save(self, content, fname, overwrite=True):
    from html5lib import serialize
    s = serialize(content, tree="lxml")
    self.content = s
    self.write(s, fname, overwrite)
def render_sanitized_html(html):
    parser = HTMLParser(tokenizer=HTMLSanitizer)
    etree = parser.parse(html)
    return serialize(etree)
def _save(self, content, fname, overwrite=True):
    from html5lib import serialize
    s = serialize(content, tree="lxml")
    self.content = s
    self.write(s, fname, overwrite)