def preprocess(self, node, nextnode):
    """
    Preprocess the XHTML tree generated by `markdown.convert()`.

    The element `node` is rewritten in place and the method then recurses
    into its children.  `nextnode` is the sibling following `node`, or
    None if `node` is the last child; only its tag is inspected.

    Raises BadURL if an image source or a link target does not match the
    URL patterns checked below.
    """
    # --- inject linebreaks between subsequent nested paragraphs ----------
    if node.tag in ('li', 'blockquote'):
        prev = None
        index = 0
        minindex = 1 if node.tag == 'li' else 0
        # Iterate over a snapshot; `index` tracks the position in the live
        # child list, which grows whenever a <br/> is inserted.
        for child in list(node):
            # For whatever reason, the first linebreak in a list item
            # introduces a new paragraph in GCW while subsequent linebreaks
            # are handled as whitespace, i.e. in these cases an explicit
            # <br/> is needed.
            if index > minindex and prev.tag == 'p' and child.tag == 'p':
                log("using <br/> to fake nested paragraph '%s'" %
                    truncate(child.text or "...", 15))
                # Every insertion needs its own element: a single shared
                # <br/> would either be moved (lxml) or end up as the very
                # same object in several slots (ElementTree).
                node.insert(index, etree.Element('br'))
                index += 1
            index += 1
            prev = child

    # --- replace <abbr> by <span> ----------------------------------------
    if node.tag == 'abbr':
        # An empty <abbr/> has no text; use the same placeholder as the
        # nested-paragraph log message above.
        log("replacing <abbr> by <span> ('%s')" %
            truncate(node.text or "...", 15))
        node.tag = 'span'

    # --- collapse <pre><code> to <pre> -----------------------------------
    precodeblock = (node.tag == 'pre' and len(node) == 1 and not node.text
                    and node[0].tag == 'code' and not node[0].tail)
    if precodeblock:
        child = node[0]
        # clear() drops the children as well as text, tail and attributes.
        node.clear()
        node.text = child.text

    # --- whitespace cleanup ----------------------------------------------
    node.text = node.text or ""
    node.tail = node.tail or ""
    # Element truthiness means "has children", so it must not be used as
    # a None test: a childless span-level sibling would evaluate as false.
    # Use explicit len() / `is not None` checks instead.
    firstisspan = len(node) > 0 and node[0].tag in SPANLEVELTAGS
    if not firstisspan:
        node.text = node.text.strip("\n")
    spanwithtail = node.tag in SPANLEVELTAGS and node.tail
    nextisspan = nextnode is not None and nextnode.tag in SPANLEVELTAGS
    if not spanwithtail and not nextisspan:
        node.tail = node.tail.strip("\n")
    if node.tag != 'pre':
        node.text = re.sub(r'\s+', ' ', node.text)
    else:
        # Preformatted text keeps its line structure, only the common
        # indentation is removed.
        node.text = textwrap.dedent(node.text)
        assert len(node) == 0, "<pre> must not contain child elements"
    node.tail = re.sub(r'\s+', ' ', node.tail)

    # --- prefix image urls -----------------------------------------------
    if node.tag == 'img':
        isrc = node.attrib['src']
        if not RXABSURL.search(isrc):
            isrc = "%s%s" % (self.mdx.imagebaseurl, isrc)
        if not RXABSURLX.search(isrc):
            raise BadURL(isrc)
        if not RXIMGEXT.search(isrc):
            # Append a dummy query parameter so that the URL ends in an
            # image file extension.
            conn = "&" if "?" in isrc else "?"
            isrc = "%s%sx=x.png" % (isrc, conn)
            log("appending artificial image file extension (%s)" % isrc)
        node.attrib['src'] = isrc

    # --- check link URLs -------------------------------------------------
    if node.tag == 'a':
        url = node.attrib['href']
        if RXABSURL.search(url):
            if not RXABSURLX.search(url):
                raise BadURL(url)
        elif not RXPAGENAME.search(url):
            raise BadURL(url)

    # --- traverse child nodes --------------------------------------------
    # Pair every child with its following sibling (None for the last one).
    for child, sibling in izip_longest(node, node[1:]):
        self.preprocess(child, sibling)
def a(self, _front, text, attrib):
    """
    Render a link.

    If `attrib['html']` is false, the link is returned as the bracketed
    string "[<href> <text>]".  Otherwise the fallback is logged and the
    result of `self.element('a', text, attrib)` is returned.  `_front` is
    not used.
    """
    if not attrib['html']:
        target = attrib['href']
        return "[%s %s]" % (target, text)

    log("using an HTML link for '%s'" % text)
    return self.element('a', text, attrib)