def serialize(self, output):
    """Serialize ``self.document`` as UTF-8 encoded HTML to ``output``.

    ``output`` must be a binary file-like object — each serialized chunk
    is encoded to bytes before being written.  Optional end tags are
    omitted from the output.
    """
    tree_walker = html5lib.treewalkers.getTreeWalker("dom")
    token_stream = tree_walker(self.document)
    serializer = HTMLSerializer(omit_optional_tags=True)
    for chunk in serializer.serialize(token_stream):
        output.write(chunk.encode('utf-8'))
def clean(self, value, model_instance):
    """
    Validates the given value using the provided HTMLCleaner and returns
    its "cleaned" value as a Python object.

    Raises ValidationError for any errors.
    """
    value = super(HTMLField, self).clean(value, model_instance)
    # Parse the submitted fragment with the sanitizing tokenizer so that
    # disallowed tags/attributes are stripped during tokenization.
    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                 tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(value)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    if self.use_imageproxy:
        from imageproxy import Proxy
        user = User.objects.get(pk=getattr(model_instance, self.user_field))
        proxy = Proxy(user)
        # Rewrite <img> sources through the per-user proxy filter.
        stream = ImageProxyFilter(stream, proxy)
    s = HTMLSerializer(omit_optional_tags=False)
    # ''.join avoids the quadratic cost of repeated `clean_value += item`
    # string concatenation the original used.
    return ''.join(s.serialize(stream))
def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class attributes.

    For example, the HTML might contain something like:
        <div class="app-ide"> ...ide info... </div>
        <div class="app-edit"> ...edit info... </div>
    If there are no filters, then the HTML is not changed.  If the filters
    include "ide" but not "edit", then the ide div remains and the edit
    div is removed.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    # `tree.iter()` / `list(elem)` replace the deprecated getiterator()/
    # getchildren(), which were removed from ElementTree in Python 3.9.
    for elem in tree.iter():
        indices_to_drop = []
        for i, child in enumerate(list(elem)):
            if _should_drop_elem(child, filters, "class", "app-"):
                # Collect in reverse order so the deletions below don't
                # shift the indices of entries not yet deleted.
                indices_to_drop.insert(0, i)
                filtered = True
                if log:
                    tag_str = "<%s" % child.tag
                    if child.attrib:
                        for n, v in child.attrib.items():
                            tag_str += ' %s="%s"' % (n, v)
                    tag_str += ">"
                    if len(tag_str) > 50:
                        tag_str = tag_str[:47] + '...'
                    log("... filter out %s", tag_str)
        for idx in indices_to_drop:
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        stream = walker(tree)
        serializer = HTMLSerializer()
        content = ''.join(serializer.serialize(stream))
        # `with` guarantees the file is closed even if a write fails; the
        # original wrote the DOCTYPE *outside* its try/finally, leaking
        # the handle on error.
        with open(path, 'w') as f:
            f.write("<!DOCTYPE html>\n")
            f.write(content)
def independentize_html_path(src, dst, css_dir=None, log=None):
    """Process the `src' HTML path to `dst' making it independent.

    - favicon links are removed
    - CSS references are updated (if `css_dir' is given), else removed.
    - Relative links are de-linkified.
    """
    if log:
        log.info("independentize %s %s", src, dst)

    # Parse the HTML file.
    with open(src) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # - Drop favicon links.
    # - Update or drop CSS links.
    head = tree.find("head")
    # Iterate over a snapshot so removals don't disturb the iteration.
    # (`list(head)` replaces the deprecated getchildren(), removed from
    # ElementTree in Python 3.9.)
    for link in list(head):
        if link.tag != "link":
            continue
        rel = link.get("rel", "").split()
        if "icon" in rel:  # this is a favicon link
            if log:
                log.debug("%s: remove <link rel='%s'/>", dst, link.get("rel"))
            head.remove(link)
            # Already removed: skip the stylesheet branch, otherwise a
            # rel="icon stylesheet" link would be removed twice and raise
            # ValueError.
            continue
        if "stylesheet" in rel:  # this is a css ref
            if css_dir:
                # Update the css dir.
                href = link.get("href")
                href = posixpath.join(css_dir, posixpath.basename(href))
                link.set("href", href)
                if log:
                    log.debug("%s: update to <link href='%s'/>", dst, href)
            else:
                if log:
                    log.debug("%s: remove <link href='%s'/>", dst,
                              link.get("href"))
                head.remove(link)

    # De-linkify local references within the full docset.
    # TODO: Eventually would like to normalize these to point
    #       to online version of the docs.
    body = tree.find("body")
    for elem in body.iter():  # iter() replaces deprecated getiterator()
        if elem.tag != "a":
            continue
        href = elem.get("href")
        if not href:
            continue
        scheme, netloc, path, params, query, fragment = urlparse(href)
        if scheme or netloc:
            # External href: leave it alone.
            continue
        if path:
            if log:
                log.debug("%s: de-linkify <a href='%s'>", dst, href)
            elem.tag = u"span"  # de-linkify

    # Write out massaged doc.
    walker = treewalkers.getTreeWalker("etree", ET)
    stream = walker(tree)
    serializer = HTMLSerializer()
    content = ''.join(serializer.serialize(stream))
    with open(dst, 'w') as f:
        f.write(content)