def __init__(self):
    """
    Create the initial DOM skeleton (a page wrapping one body) and reset
    the converter state attributes.
    """
    body = moin_page.body()
    page = moin_page.page(children=(body, ))
    self.current_node = body
    self.root = page
    # the path starts out as [root, current node]
    self.path = [page, body]
    self.header_size = 1
    self.status = ['document']
    self.footnotes = {}
def __call__(self, data, contenttype=None, arguments=None):
    """
    Function called by the converter to process the conversion.

    The decoded input is joined into one string, parsed with HTML() and its
    children are converted into the body of a new page element.

    TODO: Add support for different arguments
    """
    lines = normalize_split_text(decode_data(data, contenttype))

    # Be sure we have empty string in the base url
    self.base_url = ''

    # The content is a list of strings (one per line), glue them together
    # without any separator before building the element tree
    markup = ''.join(lines)
    tree = HTML(markup)

    # We need a root element which becomes <page> in the DOM tree; it can be
    # <html> or <div>, anything else gets wrapped into a <div>.
    # NB : If <html> used, it will be converted back to <div> after
    # one roundtrip
    if tree.tag.name not in ('html', 'div'):
        markup = '<div>' + markup + '</div>'
        tree = HTML(markup)

    # Convert the first element; all descendants are converted recursively
    children = self.do_children(tree)

    # Add Global element to our DOM Tree
    body = moin_page.body(children=children)
    return moin_page.page(children=[body])
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page whose body holds one object element linking to the raw
    data of ``rev`` (query ``do=get&rev=<revid>`` plus any extra query
    parameters found in the xinclude.href keyword argument).

    When ``arguments`` is given, its ``keyword`` mapping is used (and
    updated in place) as the attribute dict of the object element.
    """
    name = rev.item.name
    params = {'do': 'get', 'rev': rev.revid}
    attrib = {}
    if arguments:
        include_href = arguments.keyword.get(xinclude.href)
        if include_href and include_href.query:
            # the query value is similar to "w=75" given a transclusion "{{jpeg||&w=75 class="top"}}"
            params.update(url_decode(include_href.query))
        attrib = arguments.keyword
    encoded = url_encode(params, charset=CHARSET, encode_keys=True)

    attrib.update({
        moin_page.type_: unicode(self.input_type),
        xlink.href: Iri(scheme='wiki', authority='', path='/' + name, query=encoded),
    })

    target = moin_page.object_(attrib=attrib, children=[name, ])
    return moin_page.page(children=(moin_page.body(children=(target, )), ))
def __call__(self, content, arguments=None):
    """
    Wrap ``content`` in an _Iter, parse it with parse_block and return the
    resulting body inside a new page element.
    """
    body = self.parse_block(_Iter(content), arguments)
    return moin_page.page(children=[body])
def block_nowiki_repl(self, iter_content, stack, nowiki, nowiki_marker,
                      nowiki_interpret=None, nowiki_name=None, nowiki_args=None,
                      nowiki_args_old=None):
    """
    Handle a complete nowiki block.

    Without ``nowiki_interpret`` the block lines are appended verbatim to a
    blockcode element. Otherwise the lines are either parsed by this
    converter (no name, or name 'wiki') or handed to ``self.parser``.
    """
    stack.clear()

    marker_width = len(nowiki_marker)
    block_lines = _Iter(self.block_nowiki_lines(iter_content, marker_width), startno=iter_content.lineno)

    if not nowiki_interpret:
        # nothing to interpret: emit the lines as preformatted code,
        # separated by newline children
        code = moin_page.blockcode()
        stack.top_append(code)
        for text_line in block_lines:
            if len(code):
                code.append('\n')
            code.append(text_line)
        return

    if nowiki_args:
        parsed_args = parse_arguments(nowiki_args)
    elif nowiki_args_old:
        parsed_args = Arguments(keyword={'_old': nowiki_args_old})
    else:
        parsed_args = None

    logging.debug("nowiki_name: %r" % nowiki_name)

    if nowiki_name and nowiki_name != 'wiki':
        # some other markup type, delegate to the named parser
        stack.top_append(self.parser(nowiki_name, parsed_args, block_lines))
        return

    # Parse it directly if the type is ourself
    body = self.parse_block(block_lines, parsed_args)
    stack.top_append(moin_page.page(children=(body, )))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Convert CSV data into a page containing one table.

    The dialect is sniffed from the first line. When csv.Sniffer reports a
    header, the first row becomes the table head and the table gets the
    'moin-sortable' class; a csv.Error raised by the header check is shown
    in the table head instead.
    """
    lines = normalize_split_text(decode_data(data, contenttype))

    # as of py 2.7.x (and in the year 2013), the csv module seems to still
    # have troubles with unicode, thus we encode to utf-8 ...
    encoded_lines = [line.encode('utf-8') for line in lines]
    dialect = csv.Sniffer().sniff(encoded_lines[0])

    # ... and decode back to unicode; rows without any cell are dropped
    rows = []
    for encoded_row in csv.reader(encoded_lines, dialect):
        cells = [cell.decode('utf-8') for cell in encoded_row]
        if cells:
            rows.append(cells)

    head = None
    cls = None
    try:
        # fragile function throws errors when csv file is incorrectly formatted
        if csv.Sniffer().has_header('\n'.join(encoded_lines)):
            head, rows = rows[0], rows[1:]
            cls = 'moin-sortable'
    except csv.Error as e:
        head = [_('Error parsing CSV file:'), str(e)]

    table = self.build_dom_table(rows, head=head, cls=cls)
    return moin_page.page(children=(moin_page.body(children=(table, )), ))
def __call__(self, content, arguments=None):
    """
    Join the content lines, run them through pygments with ``self.lexer``
    and return a page whose body is the resulting 'highlight' blockcode.
    """
    source = u'\n'.join(content)
    highlighted = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
    # TreeFormatter gets the blockcode element as its output target
    pygments.highlight(source, self.lexer, TreeFormatter(), highlighted)
    return moin_page.page(children=(moin_page.body(children=(highlighted, )), ))
def __call__(self, content, arguments=None):
    """
    Parse the given content lines and return them as a page element.

    A fresh Mediawiki_preprocessor is stored on the converter before
    parse_block is called.
    """
    line_iter = _Iter(content)
    self.preprocessor = self.Mediawiki_preprocessor()
    parsed_body = self.parse_block(line_iter, arguments)
    return moin_page.page(children=(parsed_body, ))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Decode ``data``, highlight it with pygments using ``self.lexer`` and
    return a page whose body is the resulting 'highlight' blockcode.
    """
    lines = normalize_split_text(decode_data(data, contenttype))
    source = u'\n'.join(lines)
    highlighted = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
    # TreeFormatter gets the blockcode element as its output target
    pygments.highlight(source, self.lexer, TreeFormatter(), highlighted)
    return moin_page.page(children=(moin_page.body(children=(highlighted, )), ))
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page with a single link (query ``do=modify``) that offers to
    create the item named by ``rev.item.fqname.value``.
    """
    name = rev.item.fqname.value
    target = Iri(scheme='wiki', authority='', path='/' + name, query='do=modify')
    label = _("%(item_name)s does not exist. Create it?", item_name=name)
    link = moin_page.a(attrib={xlink.href: target}, children=[label])
    return moin_page.page(children=(moin_page.body(children=(link, )), ))
def __call__(self, content, arguments=None):
    """Parse the text and return DOM tree."""
    code = moin_page.blockcode()
    for text_line in content:
        # separate consecutive lines with a newline child
        if len(code) > 0:
            code.append('\n')
        code.append(text_line.expandtabs())
    return moin_page.page(children=(moin_page.body(children=(code, )), ))
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page with a single "Download <item name>." link pointing at
    the raw data of ``rev`` (query ``do=get&rev=<revid>``).
    """
    name = rev.item.name
    target = Iri(scheme='wiki', authority='', path='/' + name,
                 query='do=get&rev={0}'.format(rev.revid))
    link = moin_page.a(attrib={xlink.href: target},
                       children=[u"Download {0}.".format(name)])
    return moin_page.page(children=(moin_page.body(children=(link, )), ))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Decode ``data``, split it into normalized lines, parse them with
    parse_block and return the body wrapped in a page element.
    """
    source_lines = normalize_split_text(decode_data(data, contenttype))
    parsed_body = self.parse_block(_Iter(source_lines), arguments)
    return moin_page.page(children=[parsed_body])
def __call__(self, data, contenttype=None, arguments=None):
    """
    Decode ``data``, split it into normalized lines, parse them with
    parse_block and return the body wrapped in a page element.
    """
    source_lines = normalize_split_text(decode_data(data, contenttype))
    parsed_body = self.parse_block(_Iter(source_lines), arguments)
    return moin_page.page(children=(parsed_body, ))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Decode ``data`` into normalized lines, parse them and return the result
    as a page element.

    A fresh Mediawiki_preprocessor is stored on the converter before
    parse_block is called.
    """
    source_lines = normalize_split_text(decode_data(data, contenttype))
    line_iter = _Iter(source_lines)
    self.preprocessor = self.Mediawiki_preprocessor()
    parsed_body = self.parse_block(line_iter, arguments)
    return moin_page.page(children=(parsed_body, ))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Decode ``data`` and return it as a page holding one blockcode element,
    one child per line (tabs expanded) with newline children in between.
    """
    source_lines = normalize_split_text(decode_data(data, contenttype))
    code = moin_page.blockcode()
    for text_line in source_lines:
        # separate consecutive lines with a newline child
        if len(code) > 0:
            code.append('\n')
        code.append(text_line.expandtabs())
    return moin_page.page(children=(moin_page.body(children=(code, )), ))
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page with one object element of type ``self.input_type`` that
    links to the raw data of ``rev``; the element's child is a fallback
    text about missing HTML5 audio/video support.
    """
    name = rev.item.name
    media_type = unicode(self.input_type)
    target = Iri(scheme='wiki', authority='', path='/' + name,
                 query='do=get&rev={0}'.format(rev.revid))
    fallback = u'Your Browser does not support HTML5 audio/video element.'
    media = moin_page.object_(
        attrib={moin_page.type_: media_type, xlink.href: target},
        children=[fallback, ])
    return moin_page.page(children=(moin_page.body(children=(media, )), ))
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page with one object element of type ``self.input_type`` that
    links to the raw data of ``rev``; the item name is the element's child.
    """
    name = rev.item.name
    object_type = unicode(self.input_type)
    target = Iri(scheme="wiki", authority="", path="/" + name,
                 query="do=get&rev={0}".format(rev.revid))
    embedded = moin_page.object_(
        attrib={moin_page.type_: object_type, xlink.href: target},
        children=[name])
    return moin_page.page(children=(moin_page.body(children=(embedded,)),))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Function called by the converter to process the conversion.

    Input that HTML() rejects with an AssertionError is shown as an error
    message followed by the input inside <pre>; if that fails with a
    ValueError too, only an error message is shown.

    TODO: Add support for different arguments
    """
    error_div = '<div class="error"><p><strong>%s</strong></p></div>'

    text = decode_data(data, contenttype)
    # data cleanup is not needed by html_out, but is needed by moinwiki_out; CKEditor adds unwanted \n\t
    while '\t\t' in text:
        text = text.replace('\t\t', '\t')
    text = text.replace('\r\n\t', '').replace('\n\t', '')
    lines = normalize_split_text(text)

    # Be sure we have empty string in the base url
    self.base_url = ''

    # The content is a list of strings (one per line); join them into one
    # string and build an element tree from it
    html_str = u'\n'.join(lines)
    try:
        html_tree = HTML(html_str)
    except AssertionError as reason:
        # we suspect user has created or uploaded malformed HTML, try to show input as preformatted code
        msg = error_div % _('Error: malformed HTML: {reason}.').format(reason=reason)
        html_str = ''.join(['<html>', msg, '<pre>', html_str, '</pre></html>'])
        try:
            html_tree = HTML(html_str)
        except ValueError:
            msg = error_div % _('Error: malformed HTML. Try viewing source with Highlight or Modify links.')
            html_str = ''.join(['<html>', msg, '</html>'])
            html_tree = HTML(html_str)

    # We need a root element which becomes <page> in the DOM tree.
    # NB : If <html> used, it will be converted back to <div> after
    # one roundtrip
    if html_tree.tag.name != 'html':
        html_str = ''.join(['<div>', html_str, '</div>'])
        html_tree = HTML(html_str)

    # Convert the first element; all descendants are converted recursively
    children = self.do_children(html_tree)

    # Add Global element to our DOM Tree
    body = moin_page.body(children=children)
    return moin_page.page(children=[body])
def __call__(self, rev, contenttype=None, arguments=None):
    """
    List the members of the archive stored in ``rev`` as a table with the
    columns Size, Timestamp and Name, wrapped in a page element.

    NOTE(review): on ArchiveException a bare table (not wrapped in
    body/page) holding the error text is returned - confirm callers
    expect that.
    """
    self.item_name = rev.item.name
    try:
        rows = []
        for size, dt, name in self.list_contents(rev.data):
            rows.append((
                self.process_size(size),
                self.process_datetime(dt),
                self.process_name(name),
            ))
        headings = [_("Size"), _("Timestamp"), _("Name")]
        table = self.build_dom_table(rows, head=headings, cls='zebra')
        return moin_page.page(children=(moin_page.body(children=(table, )), ))
    except ArchiveException as err:
        logging.exception("An exception within archive file handling occurred:")
        # XXX we also use a table for error reporting, could be
        # something more adequate, though:
        return self.build_dom_table([[str(err)]])
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page with one object element of type ``self.input_type`` that
    links to the raw data of ``rev``; the element's child is a fallback
    text about missing HTML5 audio/video support.
    """
    name = rev.item.name
    media_type = unicode(self.input_type)
    target = Iri(scheme='wiki', authority='', path='/' + name,
                 query='do=get&rev={0}'.format(rev.revid))
    fallback = u'Your Browser does not support HTML5 audio/video element.'
    media = moin_page.object_(
        attrib={moin_page.type_: media_type, xlink.href: target},
        children=[fallback, ])
    return moin_page.page(children=(moin_page.body(children=(media, )), ))
def block_nowiki_repl(self, iter_content, stack, nowiki):
    """Handles a complete nowiki block"""
    stack.clear()

    try:
        first = iter_content.next()
    except StopIteration:
        # no line left at all: emit an empty blockcode
        stack.push(moin_page.blockcode())
        return

    # Stop directly if we got an end marker in the first line
    end_match = self.nowiki_end_re.match(first)
    if end_match and not end_match.group('escape'):
        stack.push(moin_page.blockcode())
        return

    remaining = _Iter(self.block_nowiki_lines(iter_content), startno=iter_content.lineno)

    interpret_match = self.nowiki_interpret_re.match(first)
    if not interpret_match:
        # plain nowiki: the first line and every following line become
        # children of one blockcode, each following line after a newline
        code = moin_page.blockcode(children=(first, ))
        stack.top_append(code)
        for text_line in remaining:
            code.append('\n')
            code.append(text_line)
        return

    name = interpret_match.group('nowiki_name')
    args = interpret_match.group('nowiki_args')
    if args:
        args = parse_arguments(args)

    if name and name != 'creole':
        stack.top_append(self.parser(name, args, remaining))
    else:
        # Parse it directly if the type is ourself
        body = self.parse_block(remaining, args)
        stack.top_append(moin_page.page(children=(body, )))
def block_nowiki_repl(self, iter_content, stack, nowiki):
    """Handles a complete nowiki block"""
    stack.clear()

    try:
        first = iter_content.next()
    except StopIteration:
        # no line left at all: emit an empty blockcode
        stack.push(moin_page.blockcode())
        return

    # Stop directly if we got an end marker in the first line
    end_match = self.nowiki_end_re.match(first)
    if end_match and not end_match.group("escape"):
        stack.push(moin_page.blockcode())
        return

    remaining = _Iter(self.block_nowiki_lines(iter_content), startno=iter_content.lineno)

    interpret_match = self.nowiki_interpret_re.match(first)
    if not interpret_match:
        # plain nowiki: the first line and every following line become
        # children of one blockcode, each following line after a newline
        code = moin_page.blockcode(children=(first,))
        stack.top_append(code)
        for text_line in remaining:
            code.append("\n")
            code.append(text_line)
        return

    name = interpret_match.group("nowiki_name")
    args = interpret_match.group("nowiki_args")
    if args:
        args = parse_arguments(args)

    if name and name != "creole":
        stack.top_append(self.parser(name, args, remaining))
    else:
        # Parse it directly if the type is ourself
        body = self.parse_block(remaining, args)
        stack.top_append(moin_page.page(children=(body,)))
def __call__(self, data, contenttype=None, arguments=None):
    """
    Convert CSV data into a page containing one table; the CSV dialect is
    sniffed from the first line and rows without any cell are dropped.
    """
    lines = normalize_split_text(decode_data(data, contenttype))

    # as of py 2.7.x (and in the year 2013), the csv module seems to still
    # have troubles with unicode, thus we encode to utf-8 ...
    encoded_lines = [line.encode('utf-8') for line in lines]
    dialect = csv.Sniffer().sniff(encoded_lines[0])

    # ... and decode back to unicode
    rows = []
    for encoded_row in csv.reader(encoded_lines, dialect):
        cells = [cell.decode('utf-8') for cell in encoded_row]
        if cells:
            rows.append(cells)

    table = self.build_dom_table(rows)
    return moin_page.page(children=(moin_page.body(children=(table, )), ))
def __call__(self, rev, contenttype=None, arguments=None):
    """
    Return a page whose body holds one object element linking to the raw
    data of ``rev``.

    :param rev: revision; ``rev.item.name`` and ``rev.revid`` are read
    :param contenttype: not used by this method
    :param arguments: optional arguments object; when given, its ``keyword``
                      mapping is used (and updated in place) as the attribute
                      dict of the object element, and the query string of its
                      ``xinclude.href`` entry, if present, is merged into the
                      link query
    :returns: a moin_page.page element
    """
    item_name = rev.item.name
    query_keys = {'do': 'get', 'rev': rev.revid}
    attrib = {}
    if arguments:
        # The keyword arguments need not contain an xinclude.href entry; .get()
        # then yields None, so check before reading its .query attribute
        # instead of failing with an AttributeError.
        href = arguments.keyword.get(xinclude.href)
        if href is not None and href.query:
            query_keys.update(url_decode(href.query))
        attrib = arguments.keyword
    query = url_encode(query_keys, charset=CHARSET, encode_keys=True)

    attrib.update({
        moin_page.type_: unicode(self.input_type),
        xlink.href: Iri(scheme='wiki', authority='', path='/' + item_name, query=query),
    })

    obj = moin_page.object_(attrib=attrib, children=[item_name, ])
    body = moin_page.body(children=(obj, ))
    return moin_page.page(children=(body, ))
def __call__(self, rev, contenttype=None, arguments=None):
    """
    List the members of the archive stored in ``rev`` as a table with the
    columns Size, Timestamp and Name, wrapped in a page element.

    NOTE(review): on ArchiveException a bare table (not wrapped in
    body/page) holding the error text is returned - confirm callers
    expect that.
    """
    self.item_name = rev.item.name
    try:
        rows = []
        for size, dt, name in self.list_contents(rev.data):
            rows.append((
                self.process_size(size),
                self.process_datetime(dt),
                self.process_name(name),
            ))
        headings = [_("Size"), _("Timestamp"), _("Name")]
        table = self.build_dom_table(rows, head=headings, cls='zebra')
        return moin_page.page(children=(moin_page.body(children=(table, )), ))
    except ArchiveException as err:
        logging.exception(
            "An exception within archive file handling occurred:")
        # XXX we also use a table for error reporting, could be
        # something more adequate, though:
        return self.build_dom_table([[str(err)]])
class Converter(object):
    """
    Markdown input converter.

    The text is parsed with the Markdown package into an ElementTree, whose
    nodes are then converted by the visit_* methods into moin_page DOM
    elements. HTML embedded in text nodes is converted afterwards with
    html_in_converter.
    """
    # {{{ html conversion

    # HTML tags which can be converted directly to the moin_page namespace
    symmetric_tags = set(['div', 'p', 'strong', 'code', 'quote', 'blockquote'])

    # HTML tags to define a list, except dl which is a little bit different
    list_tags = set(['ul', 'ol'])

    # HTML tags which can be convert without attributes in a different DOM tag
    # (visit() checks symmetric_tags first, so the 'strong' entry below is
    # never reached through visit())
    simple_tags = {
        # Emphasis
        'em': moin_page.emphasis,
        'i': moin_page.emphasis,
        # Strong
        'b': moin_page.strong,
        'strong': moin_page.strong,
        # Code and Blockcode
        'pre': moin_page.blockcode,
        'tt': moin_page.code,
        'samp': moin_page.code,
        # Lists
        'dl': moin_page.list_item,
        'dt': moin_page.list_item_label,
        'dd': moin_page.list_item_body,
        # Table - th and td require special processing for alignment of cell contents
        'table': moin_page.table,
        'thead': moin_page.table_header,
        'tbody': moin_page.table_body,
        'tr': moin_page.table_row,
    }

    # HTML Tag which does not have equivalence in the DOM Tree
    # But we keep the information using <span element>
    inline_tags = set(['abbr', 'acronym', 'address', 'dfn', 'kbd'])

    # HTML tags which are completely ignored by our converter.
    # We even do not process children of these elements.
    ignored_tags = set([
        'applet', 'area', 'button', 'caption', 'center', 'fieldset', 'form',
        'frame', 'frameset', 'head', 'iframe', 'input', 'isindex', 'label',
        'legend', 'link', 'map', 'menu', 'noframes', 'noscript', 'optgroup',
        'option', 'param', 'script', 'select', 'style', 'textarea', 'title',
        'var',
    ])

    # standard_attributes are html attributes which are used
    # directly in the DOM tree, without any conversion
    standard_attributes = set(['title', 'class', 'style'])

    # Regular expression to detect an html heading tag
    heading_re = re.compile('h[1-6]')

    def new(self, tag, attrib, children):
        """
        Return a new element for the DOM Tree
        """
        return ET.Element(tag, attrib=attrib, children=children)

    def new_copy(self, tag, element, attrib):
        """
        Function to copy one element to the DOM Tree.

        It first converts the child of the element,
        and the element itself.

        Note: the passed ``attrib`` dict is updated in place with the
        converted attributes of ``element``.
        """
        attrib_new = self.convert_attributes(element)
        attrib.update(attrib_new)
        children = self.do_children(element)
        return self.new(tag, attrib, children)

    def new_copy_symmetric(self, element, attrib):
        """
        Create a new QName, with the same tag of the element,
        but with a different namespace.

        Then, we handle the copy normally.
        """
        tag = ET.QName(element.tag, moin_page)
        return self.new_copy(tag, element, attrib)

    def convert_attributes(self, element):
        """
        Return a dict with the attributes of ``element`` that are kept:
        those listed in standard_attributes (under the html namespace) and
        'id' (under the xml namespace).
        """
        result = {}
        for key, value in element.attrib.iteritems():
            if key in self.standard_attributes:
                result[html(key)] = value
            if key == 'id':
                result[xml('id')] = value
        return result

    def visit_heading(self, element):
        """
        Function to convert an heading tag into a proper
        element in our moin_page namespace
        """
        # second character of the tag, i.e. the digit of 'h1' .. 'h6'
        heading_level = element.tag[1]
        key = moin_page('outline-level')
        attrib = {}
        attrib[key] = heading_level
        return self.new_copy(moin_page.h, element, attrib)

    def visit_br(self, element):
        """Convert <br> into a line_break element."""
        return moin_page.line_break()

    def visit_big(self, element):
        """Convert <big> into a span with font-size 120%."""
        key = moin_page('font-size')
        attrib = {}
        attrib[key] = '120%'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_small(self, element):
        """Convert <small> into a span with font-size 85%."""
        key = moin_page('font-size')
        attrib = {}
        attrib[key] = '85%'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_sub(self, element):
        """Convert <sub> into a span with baseline-shift 'sub'."""
        key = moin_page('baseline-shift')
        attrib = {}
        attrib[key] = 'sub'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_sup(self, element):
        """Convert <sup> into a span with baseline-shift 'super'."""
        key = moin_page('baseline-shift')
        attrib = {}
        attrib[key] = 'super'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_u(self, element):
        """Convert <u> into a span with text-decoration 'underline'."""
        key = moin_page('text-decoration')
        attrib = {}
        attrib[key] = 'underline'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_ins(self, element):
        """Convert <ins> into a span with text-decoration 'underline'."""
        key = moin_page('text-decoration')
        attrib = {}
        attrib[key] = 'underline'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_del(self, element):
        """Convert <del> into a span with text-decoration 'line-through'."""
        key = moin_page('text-decoration')
        attrib = {}
        attrib[key] = 'line-through'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_s(self, element):
        """Convert <s> into a span with text-decoration 'line-through'."""
        key = moin_page('text-decoration')
        attrib = {}
        attrib[key] = 'line-through'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_strike(self, element):
        """Convert <strike> into a span with text-decoration 'line-through'."""
        key = moin_page('text-decoration')
        attrib = {}
        attrib[key] = 'line-through'
        return self.new_copy(moin_page.span, element, attrib)

    def visit_hr(self, element, default_class=u'moin-hr3'):
        """Convert <hr> into a separator element carrying ``default_class``."""
        return self.new_copy(moin_page.separator, element, {moin_page.class_: default_class})

    def visit_img(self, element):
        """
        <img src="URI" /> --> <object xlink:href="URI />

        A src without a scheme is turned into an xinclude of a wiki.local
        target instead.
        """
        attrib = {}
        url = Iri(element.attrib.get('src'))
        if element.attrib.get('alt'):
            attrib[html.alt] = element.attrib.get('alt')
        if url.scheme is None:
            # img tag
            target = Iri(scheme='wiki.local', path=element.attrib.get("src"), fragment=None)
            attrib[xinclude.href] = target
            new_node = xinclude.include(attrib=attrib)
        else:
            # object tag
            attrib[xlink.href] = url
            new_node = moin_page.object(attrib)
        return new_node

    def visit_object(self, element):
        """
        <object data="href"></object> --> <object xlink="href" />
        """
        key = xlink('href')
        attrib = {}
        # NOTE(review): self.base_url is not assigned anywhere in this class - confirm it is set elsewhere
        if self.base_url:
            attrib[key] = ''.join([self.base_url, element.get(html.data)])
        else:
            attrib[key] = element.get(html.data)

        # Convert the href attribute into unicode
        attrib[key] = unicode(attrib[key])
        return moin_page.object(attrib)

    def visit_inline(self, element):
        """
        For some specific inline tags (defined in inline_tags)
        We just return <span element="tag.name">
        """
        key = html.class_
        attrib = {}
        # NOTE(review): the other methods treat element.tag as a plain string;
        # confirm that .name is available on the tags handled here
        attrib[key] = ''.join(['html-', element.tag.name])
        return self.new_copy(moin_page.span, element, attrib)

    def visit_li(self, element):
        """
        NB : A list item (<li>) is like the following snippet::

            <list-item>
                <list-item-label>label</list-item-label>
                <list-item-body>Body</list-item-body>
            </list-item>

        For <li> element, there is no label
        """
        list_item_body = ET.Element(moin_page.list_item_body,
                                    attrib={}, children=self.do_children(element))
        return ET.Element(moin_page.list_item, attrib={}, children=[list_item_body])

    def visit_list(self, element):
        """
        Convert a list of item (whatever the type : ordered or unordered)
        So we have html code like::

            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
            </ul>

        Which will be converted to::

            <list>
                <list-item>
                    <list-item-body>Item 1</list-item-body>
                </list-item>
                <list-item>
                    <list-item-body>Item 2</list-item-body>
                </list-item>
            </list>
        """
        # We will define the appropriate attribute
        # according to the type of the list
        attrib = {}
        # NOTE(review): 'dir' is not in list_tags, so visit() never routes a <dir> element here
        if element.tag == "ul" or element.tag == "dir":
            attrib[moin_page('item-label-generate')] = 'unordered'
        elif element.tag == "ol":
            attrib[moin_page('item-label-generate')] = 'ordered'

        return ET.Element(moin_page.list, attrib=attrib, children=self.do_children(element))

    def visit_a(self, element):
        """
        Convert <a> into a moin_page.a element; when the href does not pass
        allowed_uri_scheme() the post-processed href text is returned
        instead of an element.
        """
        key = xlink('href')
        attrib = {}
        href = postproc_text(self.markdown, element.attrib.get("href"))
        if allowed_uri_scheme(href):
            attrib[key] = href
        else:
            return href
        return self.new_copy(moin_page.a, element, attrib)

    def convert_align_to_class(self, attrib):
        """
        Return a dict holding a class attribute equal to the 'align' value
        of ``attrib`` when that value is right, center or left; an empty
        dict otherwise.
        """
        attr = {}
        alignment = attrib.get('align')
        if alignment in (u'right', u'center', u'left'):
            attr[moin_page.class_] = alignment
        return attr

    def visit_th(self, element):
        """Convert <th>, mapping its align attribute to a class."""
        attrib = self.convert_align_to_class(element.attrib)
        return self.new_copy(html.th, element, attrib=attrib)

    def visit_td(self, element):
        """Convert <td>, mapping its align attribute to a class."""
        attrib = self.convert_align_to_class(element.attrib)
        return self.new_copy(html.td, element, attrib=attrib)

    def visit(self, element):
        """
        Dispatch ``element`` to the matching conversion, depending on its tag.

        Returns None for ignored and unhandled tags (both are logged).
        """
        # Our element can be converted directly, just by changing the namespace
        if element.tag in self.symmetric_tags:
            return self.new_copy_symmetric(element, attrib={})

        # Our element is enough simple to just change the tag name
        if element.tag in self.simple_tags:
            return self.new_copy(self.simple_tags[element.tag], element, attrib={})

        # Our element defines a list
        if element.tag in self.list_tags:
            return self.visit_list(element)

        # We convert our element as a span tag with element attribute
        if element.tag in self.inline_tags:
            return self.visit_inline(element)

        # We have a heading tag
        if self.heading_re.match(element.tag):
            return self.visit_heading(element)

        # Otherwise we need a specific procedure to handle it
        method_name = 'visit_' + element.tag
        method = getattr(self, method_name, None)
        if method:
            return method(element)

        # We should ignore this tag
        if element.tag in self.ignored_tags:
            logging.info("INFO : Ignored tag : {0}".format(element.tag))
            return

        logging.info("INFO : Unhandled tag : {0}".format(element.tag))
        return

    def do_children(self, element, add_lineno=False):
        """
        Convert the text and the children of ``element`` and return them as
        a flat list.

        With ``add_lineno``, each converted child element gets a data_lineno
        attribute taken from the front of self.line_numbers (as long as that
        deque is not empty).
        """
        new = []
        # markdown parser surrounds child nodes with unwanted u"\n" children, here we remove leading \n
        if hasattr(
                element,
                "text") and element.text is not None and element.text != u'\n':
            new.append(postproc_text(self.markdown, element.text))

        for child in element:
            r = self.visit(child)
            if r is None:
                r = ()
            elif not isinstance(r, (list, tuple)):
                if add_lineno and self.line_numbers:
                    r.attrib[html.data_lineno] = self.line_numbers.popleft()
                r = (r, )
            new.extend(r)
            # markdown parser surrounds child nodes with unwanted u"\n" children, here we drop trailing \n
            if hasattr(
                    child,
                    "tail") and child.tail is not None and child.tail != u'\n':
                new.append(postproc_text(self.markdown, child.tail))
        return new

    # }}}

    def count_lines(self, text):
        """
        Create a list of line numbers corresponding to the first line of each markdown block.

        The markdown parser does not provide text line numbers nor is there an easy way to
        add line numbers. As an alternative, we try to split the input text into the same blocks
        as the parser does, then calculate the starting line number of each block.  The list will be
        processed by the do_children method above.

        This method has unresolved problems caused by splitting the text into blocks based upon
        the presence of 2 adjacent line end characters, including:

            * blank lines within lists create separate blocks
            * omitting a blank line after a heading combines 2 elements into one block
            * using more than one blank lines between blocks

        The net result is we either have too few or too many line numbers in the generated list which
        will cause the double-click-to-edit autoscroll textarea to sometimes be off by several lines.

        The result is stored as a deque in self.line_numbers; nothing is returned.

        TODO: revisit this when the parsing errors documented in contrib/serialized/items.moin
        (markdown item) are fixed.
        """
        line_numbers = deque()
        lineno = 1
        in_blockquote = False
        blocks = text.split(u'\n\n')
        for block in blocks:
            if not block:
                # bump count because empty blocks will be discarded
                lineno += 2
                continue
            line_count = block.count(u'\n')

            # detect and fix the problem of interspersed blank lines within blockquotes
            if block.startswith(u' ') or block.startswith(u'\n '):
                if in_blockquote:
                    lineno += line_count + 2
                    continue
                in_blockquote = True
            else:
                in_blockquote = False

            if block.startswith(u'\n'):
                lineno += 1
                line_numbers.append(lineno)
                lineno += line_count + 2 - 1  # -1 is already in count
            else:
                line_numbers.append(lineno)
                lineno += line_count + 2
        self.line_numbers = line_numbers

    def embedded_markup(self, text):
        """
        Per http://meta.stackexchange.com/questions/1777/what-html-tags-are-allowed-on-stack-exchange-sites
        markdown markup allows users to specify several "safe" HTML tags within a document. These tags include:

            a b blockquote code del dd dl dt em h1 h2 h3 i img kbd li ol p pre s sup sub strong strike ul br hr

        In addition, some markdown extensions output raw HTML tags (e.g. fenced outputs "<pre><code>...").
        To prevent the <, > characters from being escaped, the embedded tags are converted to nodes by using
        the converter in html_in.py.

        Returns the converted <p> node when the conversion yields page/body/p,
        otherwise whatever the conversion returned (or the unchanged text when
        the conversion raised an AssertionError).
        """
        try:
            # work around a possible bug - there is a traceback if HTML document has no tags
            p_text = html_in_converter(u'<p>%s</p>' % text)
        except AssertionError:
            # html_in converter (EmeraldTree) throws exceptions on markup style links: "Some text <http://moinmo.in> more text"
            p_text = text
        if not isinstance(
                p_text, unicode
        ) and p_text.tag == moin_page.page and p_text[
                0].tag == moin_page.body and p_text[0][0].tag == moin_page.p:
            # will fix possible problem of P node having block children later
            return p_text[0][0]
        return p_text

    def convert_embedded_markup(self, node):
        """
        Recurse through tree looking for embedded markup.

        Text children containing a '<' are replaced in place by the result of
        embedded_markup().

        :param node: a tree node
        """
        for idx, child in enumerate(node):
            if isinstance(child, unicode):
                if u'<' in child:
                    node[idx] = self.embedded_markup(
                        child
                    )  # child is immutable string, so must do node[idx]
            else:
                # do not convert markup within a <pre> tag
                if not child.tag == moin_page.blockcode:
                    self.convert_embedded_markup(child)

    def convert_invalid_p_nodes(self, node):
        """
        Processing embedded HTML tags within markup or output from extensions with embedded markup can
        result in invalid HTML output caused by <p> tags enclosing a block element.

        The solution is to search for these occurrences and change the <p> tag to a <div>.

        :param node: a tree node
        """
        for child in node:
            if not isinstance(child, unicode):
                if child.tag == moin_page.p and len(child):
                    for grandchild in child:
                        if not isinstance(
                                grandchild,
                                unicode) and grandchild.tag in BLOCK_ELEMENTS:
                            child.tag = moin_page.div
                self.convert_invalid_p_nodes(child)

    def __init__(self):
        """Create the Markdown instance (with the 'extra' and 'toc' extensions) used for parsing."""
        self.markdown = Markdown(extensions=[
            'extra',
            'toc',
        ])

    @classmethod
    def _factory(cls, input, output, **kw):
        """Return a new converter instance; the arguments are not used."""
        return cls()

    def __call__(self, data, contenttype=None, arguments=None):
        """
        Convert markdown ``data`` into a moin_page.page element.

        ``arguments`` is not used.
        """
        text = decode_data(data, contenttype)

        # {{{ stolen from Markdown.convert

        # Fixup the source text
        try:
            text = unicode(text)
        except UnicodeDecodeError, e:
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        text = text.replace(md_util.STX, "").replace(md_util.ETX, "")
        text = text.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        text = text.expandtabs(self.markdown.tab_length)
        text = re.sub(r'(?<=\n) +\n', '\n', text)

        # stores the block start line numbers in self.line_numbers (consumed by do_children)
        self.count_lines(text)

        # Split into lines and run the line preprocessors.
        lines = text.split("\n")
        for prep in self.markdown.preprocessors.values():
            lines = prep.run(lines)

        # Parse the high-level elements.
        md_root = self.markdown.parser.parseDocument(lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.markdown.treeprocessors.values():
            new_md_root = treeprocessor.run(md_root)
            # NOTE(review): this is a truthiness test, not "is not None" - confirm that is intended
            if new_md_root:
                md_root = new_md_root

        # }}}

        # md_root is a list of plain old Python ElementTree objects.

        add_lineno = bool(flaskg and flaskg.add_lineno_attr)
        converted = self.do_children(md_root, add_lineno=add_lineno)
        body = moin_page.body(children=converted)
        root = moin_page.page(children=[body])
        self.convert_embedded_markup(root)
        self.convert_invalid_p_nodes(root)

        return root
def handle_nowiki(self, elem, page):
    """
    {{{* where * may be #!wiki, #!csv, #!highlight python, "", etc., or an invalid argument.

    The three children of ``elem`` (marker length, nowiki arguments, content)
    are removed and replaced by the converted content:

    * no arguments: a plain blockcode
    * highlight (or an old style lexer name such as python): a pygments
      highlighted blockcode; a missing or unknown lexer name is reported
      through invalid_args and rendered with the 'text' lexer
    * csv: a sortable table
    * wiki, creole, rst, docbook, markdown, mediawiki, html: a page produced
      by the matching input converter
    * anything else: reported through invalid_args and rendered with the
      'text' lexer

    :param elem: the nowiki element, modified in place
    :param page: not used by this method
    :returns: None
    """
    logging.debug("handle_nowiki elem: %r", elem)
    marker_len, all_nowiki_args, content = elem._children
    nowiki_args = all_nowiki_args[0].strip()

    # remove all the old children of the element, new children will be added
    elem.remove_all()

    if not nowiki_args:
        # input similar to: {{{\ntext\n}}}\n
        blockcode = moin_page.blockcode(children=(content, ))
        elem.append(blockcode)
        return

    if nowiki_args.startswith('#!') and len(nowiki_args) > 2:
        arguments = nowiki_args[2:].split(' ', 1)  # skip leading #!
        nowiki_name = arguments[0]
        optional_args = arguments[1] if len(arguments) > 1 else None
    else:
        nowiki_name = optional_args = None

    lexer = None
    if nowiki_name in set(('diff', 'cplusplus', 'python', 'java', 'pascal', 'irc')):
        # make old style markup similar to {{{#!python like new style {{{#!highlight python
        optional_args = nowiki_name if not optional_args else nowiki_name + ' ' + optional_args
        nowiki_name = 'highlight'

    if nowiki_name == u'highlight':
        # TODO: support moin 1.9 options like numbers=on start=222 step=10
        # "{{{#!highlight" without a lexer name leaves optional_args as None;
        # treat that like an unknown lexer instead of failing on None.split()
        lexer_words = optional_args.split() if optional_args else []
        if lexer_words:
            lexer_name = lexer_words[0]  # ignore all parameters except lexer name
            try:
                lexer = pygments.lexers.get_lexer_by_name(lexer_name)
            except ClassNotFound:
                try:
                    lexer = pygments.lexers.get_lexer_for_mimetype(lexer_name)
                except ClassNotFound:
                    lexer = None
        if lexer is None:
            self.invalid_args(elem, all_nowiki_args)
            lexer = pygments.lexers.get_lexer_by_name('text')

    if lexer:
        blockcode = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
        pygments.highlight(content, lexer, TreeFormatter(), blockcode)
        elem.append(blockcode)
        return

    if nowiki_name in ('csv', 'text/csv'):
        # TODO: support moin 1.9 options: quotechar, show, hide, autofilter, name, link, static_cols, etc
        delim = None
        if optional_args:
            m = re.search('delimiter=(.?)', optional_args)
            if m and m.group(1):
                delim = m.group(1)
            if not delim:
                delim = optional_args.split()[0]  # ignore all parameters except a delimiter in first position
                if len(delim) > 1:
                    delim = None
        sep = delim or u';'
        content = content.split('\n')
        # the first line is always used as the table head
        head = content[0].split(sep)
        rows = [x.split(sep) for x in content[1:]]
        csv_builder = TableMixin()
        table = csv_builder.build_dom_table(rows, head=head, cls='moin-csv-table moin-sortable')
        elem.append(table)
        return

    if nowiki_name in ('wiki', 'text/x.moin.wiki',):
        from .moinwiki_in import Converter as moinwiki_converter
        moinwiki = moinwiki_converter()
        lines = normalize_split_text(content)
        lines = _Iter(lines)
        # reparse arguments from original: {{{#!wiki solid/orange (style="color: red;")
        wiki_args = parse_arguments(all_nowiki_args[0][2:])
        if len(wiki_args.positional) > 1:
            # positional arguments after the first one become CSS classes
            wiki_args.keyword['class'] = u' '.join(wiki_args.positional[1:])
        del wiki_args.positional[:]
        body = moinwiki.parse_block(lines, wiki_args)
        sub_page = moin_page.page(children=(body, ))
        elem.append(sub_page)
        return

    if nowiki_name in ('creole', 'text/x.moin.creole'):
        from .creole_in import Converter as creole_converter
        creole = creole_converter()
        lines = normalize_split_text(content)
        lines = _Iter(lines)
        body = creole.parse_block(lines, optional_args)
        sub_page = moin_page.page(children=(body, ))
        elem.append(sub_page)
        return

    if nowiki_name in ('rst', 'text/x-rst'):
        from .rst_in import Converter as rst_converter
        rst = rst_converter()
        sub_page = rst(content, contenttype=u'text/x-rst;charset=utf-8')
        elem.append(sub_page)
        return

    if nowiki_name in ('docbook', 'application/docbook+xml'):
        from .docbook_in import Converter as docbook_converter
        docbook = docbook_converter()
        sub_page = docbook(content, contenttype=u'application/docbook+xml;charset=utf-8')
        elem.append(sub_page)
        return

    if nowiki_name in ('markdown', 'text/x-markdown'):
        from .markdown_in import Converter as markdown_converter
        markdown = markdown_converter()
        sub_page = markdown(content, contenttype=u'text/x-markdown;charset=utf-8')
        elem.append(sub_page)
        return

    if nowiki_name in ('mediawiki', 'text/x-mediawiki'):
        from .mediawiki_in import Converter as mediawiki_converter
        mediawiki = mediawiki_converter()
        sub_page = mediawiki(content, optional_args)
        elem.append(sub_page)
        return

    if nowiki_name in ('html', 'HTML', 'text/html'):
        from .html_in import Converter as html_converter
        html_conv = html_converter()
        sub_page = html_conv(content, optional_args)
        elem.append(sub_page)
        return

    # unknown or malformed arguments: report them and show the content as plain text
    self.invalid_args(elem, all_nowiki_args)
    lexer = pygments.lexers.get_lexer_by_name('text')
    blockcode = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
    pygments.highlight(content, lexer, TreeFormatter(), blockcode)
    elem.append(blockcode)
    return
def __call__(self, data, contenttype=None, arguments=None):
    """
    Convert markdown to moin DOM.

    data is a pointer to an open file (ProtectedRevision object)
    contenttype is likely == u'text/x-markdown;charset=utf-8'
    arguments is not used

    Markdown processing takes place in five steps:

    1. A bunch of "preprocessors" munge the input text.
    2. BlockParser() parses the high-level structural elements of the
       pre-processed text into an ElementTree.
    3. A bunch of "treeprocessors" are run against the ElementTree. One
       such treeprocessor runs InlinePatterns against the ElementTree,
       detecting inline markup.
    4. Some post-processors are run against the ElementTree nodes containing
       text and the ElementTree is converted to an EmeraldTree.
    5. The root of the EmeraldTree is returned.

    Returns a moin_page.page element holding a single moin_page.body whose
    children are the converted nodes.
    """
    # read the data from wiki storage and convert to unicode
    text = decode_data(data, contenttype)

    # Normalize whitespace for consistent parsing. - copied from NormalizeWhitespace in markdown/preprocessors.py
    text = text.replace(md_util.STX, "").replace(md_util.ETX, "")
    text = text.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
    text = text.expandtabs(self.markdown.tab_length)
    # blank out lines that consist of spaces only
    text = re.sub(r'(?<=\n) +\n', '\n', text)

    # save line counts for start of each block, used later for edit autoscroll
    self.count_lines(text)

    # {{{ stolen from Markdown.convert

    # Split into lines and run the line preprocessors.
    lines = text.split("\n")
    for prep in self.markdown.preprocessors.values():
        lines = prep.run(lines)

    # Parse the high-level elements, md_root is an ElementTree object
    md_root = self.markdown.parser.parseDocument(lines).getroot()

    # Run the tree-processors
    for treeprocessor in self.markdown.treeprocessors.values():
        new_md_root = treeprocessor.run(md_root)
        # Compare against None rather than testing truthiness: an ElementTree
        # element without children is falsy, so a plain truth test would
        # silently discard a valid (but childless) replacement root.
        if new_md_root is not None:
            md_root = new_md_root

    # }}} end stolen from Markdown.convert

    # Treat a missing add_lineno_attr attribute the same as a false one
    # instead of letting the attribute lookup raise AttributeError.
    add_lineno = bool(flaskg and getattr(flaskg, 'add_lineno_attr', False))

    # run markdown post processors and convert from ElementTree to an EmeraldTree object
    converted = self.do_children(md_root, add_lineno=add_lineno)

    # convert html embedded in text strings to EmeraldTree nodes
    self.convert_embedded_markup(converted)
    # convert P-tags containing block elements to DIV-tags
    self.convert_invalid_p_nodes(converted)

    body = moin_page.body(children=converted)
    root = moin_page.page(children=[body])

    return root