Example #1
0
 def __init__(self):
     """
     Create an empty body wrapped in a page and initialise the converter state.
     """
     body = moin_page.body()
     page = moin_page.page(children=(body, ))
     self.current_node = body
     self.root = page
     # path starts with the page followed by its body
     self.path = [page, body]
     self.header_size = 1
     self.status = ['document']
     self.footnotes = {}
Example #2
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Function called by the converter to process the
        conversion of HTML input into a moin_page DOM tree.

        :param data: input handed to decode_data together with contenttype
        :param contenttype: content type hint forwarded to decode_data
        :param arguments: currently unused
        :returns: a moin_page.page element containing a single body

        TODO: Add support for different arguments
        """
        text = decode_data(data, contenttype)
        content = normalize_split_text(text)
        # Be sure we have empty string in the base url
        self.base_url = ''

        # We create an element tree from the HTML content.
        # The content is a list of strings, line per line. Join the lines
        # with a newline: joining with an empty string would glue the text
        # of adjacent lines together (e.g. inside a <pre> element).
        html_str = u'\n'.join(content)
        html_tree = HTML(html_str)

        # We should have a root element, which will be converted as <page>
        # for the DOM Tree. It can be <html> or <div>.
        # NB : If <html> used, it will be converted back to <div> after
        # one roundtrip
        if html_tree.tag.name not in ('html', 'div'):
            html_str = ''.join(['<div>', html_str, '</div>'])
            html_tree = HTML(html_str)

        # Start the conversion of the first element
        # Every child of each element will be recursively converted too
        element = self.do_children(html_tree)

        # Add Global element to our DOM Tree
        body = moin_page.body(children=element)
        root = moin_page.page(children=[body])
        return root
Example #3
0
    def __call__(self, rev, contenttype=None, arguments=None):
        """
        Return a page whose body holds one object element pointing at rev.

        The link query starts as do=get plus the revision id; query parameters
        found on the xinclude.href keyword argument are merged into it. When
        arguments are given, their keyword dict is used (and updated in place)
        as the attributes of the object element.
        """
        name = rev.item.name
        params = {'do': 'get', 'rev': rev.revid}
        if arguments:
            href = arguments.keyword.get(xinclude.href)
            if href and href.query:
                # href.query value is similar to "w=75" given a transclusion "{{jpeg||&w=75 class="top"}}"
                params.update(url_decode(href.query))
            attributes = arguments.keyword
        else:
            attributes = {}

        encoded_query = url_encode(params, charset=CHARSET, encode_keys=True)
        target = Iri(scheme='wiki', authority='', path='/' + name, query=encoded_query)
        attributes.update({
            moin_page.type_: unicode(self.input_type),
            xlink.href: target,
        })

        obj = moin_page.object_(attrib=attributes, children=[name])
        return moin_page.page(children=(moin_page.body(children=(obj, )), ))
Example #4
0
    def __call__(self, content, arguments=None):
        """
        Parse content with parse_block and return the result wrapped in a page.
        """
        body = self.parse_block(_Iter(content), arguments)
        return moin_page.page(children=[body])
Example #5
0
    def block_nowiki_repl(self, iter_content, stack, nowiki, nowiki_marker,
                          nowiki_interpret=None, nowiki_name=None, nowiki_args=None,
                          nowiki_args_old=None):
        """
        Handle a nowiki block and append the resulting element to the stack.

        Without nowiki_interpret the lines are collected into a blockcode
        element. Otherwise the lines are parsed by this converter (no name or
        name 'wiki') or handed to self.parser together with the arguments.
        """
        stack.clear()

        marker_len = len(nowiki_marker)
        lines = _Iter(self.block_nowiki_lines(iter_content, marker_len), startno=iter_content.lineno)

        if not nowiki_interpret:
            # Plain block: keep the lines verbatim, separated by newlines
            code = moin_page.blockcode()
            stack.top_append(code)
            for text in lines:
                if len(code):
                    code.append('\n')
                code.append(text)
            return

        # New style arguments win over old style ones
        args = None
        if nowiki_args:
            args = parse_arguments(nowiki_args)
        elif nowiki_args_old:
            args = Arguments(keyword={'_old': nowiki_args_old})
        logging.debug("nowiki_name: %r" % nowiki_name)

        if nowiki_name and nowiki_name != 'wiki':
            stack.top_append(self.parser(nowiki_name, args, lines))
            return

        # Parse it directly if the type is ourself
        body = self.parse_block(lines, args)
        stack.top_append(moin_page.page(children=(body, )))
Example #6
0
    def block_nowiki_repl(self, iter_content, stack, nowiki, nowiki_marker,
                          nowiki_interpret=None, nowiki_name=None, nowiki_args=None,
                          nowiki_args_old=None):
        """
        Handle a nowiki block and append the resulting element to the stack.

        Without nowiki_interpret the lines are collected into a blockcode
        element. Otherwise the lines are parsed by this converter (no name or
        name 'wiki') or handed to self.parser together with the arguments.
        """
        stack.clear()

        marker_len = len(nowiki_marker)
        lines = _Iter(self.block_nowiki_lines(iter_content, marker_len), startno=iter_content.lineno)

        if not nowiki_interpret:
            # Plain block: keep the lines verbatim, separated by newlines
            code = moin_page.blockcode()
            stack.top_append(code)
            for text in lines:
                if len(code):
                    code.append('\n')
                code.append(text)
            return

        # New style arguments win over old style ones
        args = None
        if nowiki_args:
            args = parse_arguments(nowiki_args)
        elif nowiki_args_old:
            args = Arguments(keyword={'_old': nowiki_args_old})
        logging.debug("nowiki_name: %r" % nowiki_name)

        if nowiki_name and nowiki_name != 'wiki':
            stack.top_append(self.parser(nowiki_name, args, lines))
            return

        # Parse it directly if the type is ourself
        body = self.parse_block(lines, args)
        stack.top_append(moin_page.page(children=(body, )))
Example #7
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Convert CSV data into a page holding one table.

     The first row becomes the table head (and the table gets the class
     'moin-sortable') when csv.Sniffer reports a header. If the sniffer
     raises csv.Error, the head carries the error message instead.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     # as of py 2.7.x (and in the year 2013), the csv module seems to still
     # have troubles with unicode, thus we hand it utf-8 encoded lines ...
     encoded_lines = [line.encode('utf-8') for line in lines]
     dialect = csv.Sniffer().sniff(encoded_lines[0])
     # ... and decode every cell back to unicode, dropping empty rows
     rows = []
     for encoded_row in csv.reader(encoded_lines, dialect):
         cells = [cell.decode('utf-8') for cell in encoded_row]
         if cells:
             rows.append(cells)
     head = cls = None
     try:
         # fragile function throws errors when csv file is incorrectly formatted
         has_header = csv.Sniffer().has_header('\n'.join(encoded_lines))
     except csv.Error as e:
         head = [_('Error parsing CSV file:'), str(e)]
     else:
         if has_header:
             head, rows = rows[0], rows[1:]
             cls = 'moin-sortable'
     table = self.build_dom_table(rows, head=head, cls=cls)
     return moin_page.page(children=(moin_page.body(children=(table, )), ))
Example #8
0
 def __call__(self, content, arguments=None):
     """
     Highlight content with self.lexer and return it as a page.

     The lines of content are joined with newlines; a blockcode element with
     class 'highlight' is passed to pygments.highlight as the output target.
     """
     source = u'\n'.join(content)
     highlighted = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
     pygments.highlight(source, self.lexer, TreeFormatter(), highlighted)
     return moin_page.page(children=(moin_page.body(children=(highlighted, )), ))
    def __call__(self, content, arguments=None):
        """
        Parse content and return the resulting body wrapped in a page.

        A fresh Mediawiki_preprocessor is stored on self.preprocessor before
        the content is parsed.
        """
        lines = _Iter(content)
        self.preprocessor = self.Mediawiki_preprocessor()
        parsed_body = self.parse_block(lines, arguments)
        return moin_page.page(children=(parsed_body, ))
Example #10
0
 def __init__(self):
     """
     Create an empty body wrapped in a page and initialise the converter state.
     """
     body = moin_page.body()
     page = moin_page.page(children=(body, ))
     self.current_node = body
     self.root = page
     # path starts with the page followed by its body
     self.path = [page, body]
     self.header_size = 1
     self.status = ['document']
     self.footnotes = {}
Example #11
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Function called by the converter to process the
        conversion of HTML input into a moin_page DOM tree.

        :param data: input handed to decode_data together with contenttype
        :param contenttype: content type hint forwarded to decode_data
        :param arguments: currently unused
        :returns: a moin_page.page element containing a single body

        TODO: Add support for different arguments
        """
        text = decode_data(data, contenttype)
        content = normalize_split_text(text)
        # Be sure we have empty string in the base url
        self.base_url = ''

        # We create an element tree from the HTML content.
        # The content is a list of strings, line per line. Join the lines
        # with a newline: joining with an empty string would glue the text
        # of adjacent lines together (e.g. inside a <pre> element).
        html_str = u'\n'.join(content)
        html_tree = HTML(html_str)

        # We should have a root element, which will be converted as <page>
        # for the DOM Tree. It can be <html> or <div>.
        # NB : If <html> used, it will be converted back to <div> after
        # one roundtrip
        if html_tree.tag.name not in ('html', 'div'):
            html_str = ''.join(['<div>', html_str, '</div>'])
            html_tree = HTML(html_str)

        # Start the conversion of the first element
        # Every child of each element will be recursively converted too
        element = self.do_children(html_tree)

        # Add Global element to our DOM Tree
        body = moin_page.body(children=element)
        root = moin_page.page(children=[body])
        return root
Example #12
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Convert CSV data into a page holding one table.

     The first row becomes the table head (and the table gets the class
     'moin-sortable') when csv.Sniffer reports a header. If the sniffer
     raises csv.Error, the head carries the error message instead.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     # as of py 2.7.x (and in the year 2013), the csv module seems to still
     # have troubles with unicode, thus we hand it utf-8 encoded lines ...
     encoded_lines = [line.encode('utf-8') for line in lines]
     dialect = csv.Sniffer().sniff(encoded_lines[0])
     # ... and decode every cell back to unicode, dropping empty rows
     rows = []
     for encoded_row in csv.reader(encoded_lines, dialect):
         cells = [cell.decode('utf-8') for cell in encoded_row]
         if cells:
             rows.append(cells)
     head = cls = None
     try:
         # fragile function throws errors when csv file is incorrectly formatted
         has_header = csv.Sniffer().has_header('\n'.join(encoded_lines))
     except csv.Error as e:
         head = [_('Error parsing CSV file:'), str(e)]
     else:
         if has_header:
             head, rows = rows[0], rows[1:]
             cls = 'moin-sortable'
     table = self.build_dom_table(rows, head=head, cls=cls)
     return moin_page.page(children=(moin_page.body(children=(table, )), ))
Example #13
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Decode data, highlight it with self.lexer and return it as a page.

     A blockcode element with class 'highlight' is passed to
     pygments.highlight as the output target.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     source = u'\n'.join(lines)
     highlighted = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
     pygments.highlight(source, self.lexer, TreeFormatter(), highlighted)
     return moin_page.page(children=(moin_page.body(children=(highlighted, )), ))
Example #14
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page with a single link offering to create the missing item.

     The link targets the item's fully qualified name with query do=modify.
     """
     name = rev.item.fqname.value
     target = Iri(scheme='wiki', authority='', path='/' + name, query='do=modify')
     label = _("%(item_name)s does not exist. Create it?", item_name=name)
     link = moin_page.a(attrib={xlink.href: target}, children=[label])
     return moin_page.page(children=(moin_page.body(children=(link, )), ))
Example #15
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Decode data, highlight it with self.lexer and return it as a page.

     A blockcode element with class 'highlight' is passed to
     pygments.highlight as the output target.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     source = u'\n'.join(lines)
     highlighted = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
     pygments.highlight(source, self.lexer, TreeFormatter(), highlighted)
     return moin_page.page(children=(moin_page.body(children=(highlighted, )), ))
Example #16
0
 def __call__(self, content, arguments=None):
     """
     Return the lines of content as one blockcode element inside page/body.

     Tabs in every line are expanded and a newline is inserted between lines.
     """
     code = moin_page.blockcode()
     for text_line in content:
         # separate from what was appended before, but never lead with a newline
         if len(code):
             code.append('\n')
         code.append(text_line.expandtabs())
     return moin_page.page(children=(moin_page.body(children=(code, )), ))
Example #17
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page whose body holds a single download link for rev.
     """
     name = rev.item.name
     query = 'do=get&rev={0}'.format(rev.revid)
     target = Iri(scheme='wiki', authority='', path='/' + name, query=query)
     label = u"Download {0}.".format(name)
     link = moin_page.a(attrib={xlink.href: target}, children=[label])
     return moin_page.page(children=(moin_page.body(children=(link, )), ))
Example #18
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Decode data, split it into lines, parse them and return a page.
        """
        lines = normalize_split_text(decode_data(data, contenttype))
        parsed_body = self.parse_block(_Iter(lines), arguments)
        return moin_page.page(children=[parsed_body])
Example #19
0
 def __call__(self, content, arguments=None):
     """
     Return the lines of content as one blockcode element inside page/body.

     Tabs in every line are expanded and a newline is inserted between lines.
     """
     code = moin_page.blockcode()
     for text_line in content:
         # separate from what was appended before, but never lead with a newline
         if len(code):
             code.append('\n')
         code.append(text_line.expandtabs())
     return moin_page.page(children=(moin_page.body(children=(code, )), ))
Example #20
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Decode data, split it into lines, parse them and return a page.
        """
        lines = normalize_split_text(decode_data(data, contenttype))
        parsed_body = self.parse_block(_Iter(lines), arguments)
        return moin_page.page(children=(parsed_body, ))
Example #21
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Decode data, parse its lines and return the resulting body in a page.

        A fresh Mediawiki_preprocessor is stored on self.preprocessor before
        the content is parsed.
        """
        lines = _Iter(normalize_split_text(decode_data(data, contenttype)))
        self.preprocessor = self.Mediawiki_preprocessor()
        parsed_body = self.parse_block(lines, arguments)
        return moin_page.page(children=(parsed_body, ))
Example #22
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Decode data and return its lines as one blockcode element in page/body.

     Tabs in every line are expanded and a newline is inserted between lines.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     code = moin_page.blockcode()
     for text_line in lines:
         # separate from what was appended before, but never lead with a newline
         if len(code):
             code.append('\n')
         code.append(text_line.expandtabs())
     return moin_page.page(children=(moin_page.body(children=(code, )), ))
Example #23
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Decode data and return its lines as one blockcode element in page/body.

     Tabs in every line are expanded and a newline is inserted between lines.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     code = moin_page.blockcode()
     for text_line in lines:
         # separate from what was appended before, but never lead with a newline
         if len(code):
             code.append('\n')
         code.append(text_line.expandtabs())
     return moin_page.page(children=(moin_page.body(children=(code, )), ))
Example #24
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page with one object element that links to rev's data.

     The object carries self.input_type as its type and a fallback text
     as its only child.
     """
     name = rev.item.name
     query = 'do=get&rev={0}'.format(rev.revid)
     target = Iri(scheme='wiki', authority='', path='/' + name, query=query)
     attributes = {
         moin_page.type_: unicode(self.input_type),
         xlink.href: target,
     }
     fallback = u'Your Browser does not support HTML5 audio/video element.'
     obj = moin_page.object_(attrib=attributes, children=[fallback])
     return moin_page.page(children=(moin_page.body(children=(obj, )), ))
Example #25
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page with one object element that links to rev's data.

     The object carries self.input_type as its type and the item name
     as its only child.
     """
     name = rev.item.name
     query = "do=get&rev={0}".format(rev.revid)
     target = Iri(scheme="wiki", authority="", path="/" + name, query=query)
     attributes = {
         moin_page.type_: unicode(self.input_type),
         xlink.href: target,
     }
     obj = moin_page.object_(attrib=attributes, children=[name])
     return moin_page.page(children=(moin_page.body(children=(obj,)),))
Example #26
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Function called by the converter to process the
        conversion of HTML input into a moin_page DOM tree.

        If the HTML parser raises AssertionError, the input is shown as
        preformatted text below an error message; if that fallback raises
        ValueError, only an error message is shown.

        TODO: Add support for different arguments
        """
        text = decode_data(data, contenttype)
        # data cleanup is not needed by html_out, but is needed by moinwiki_out; CKEditor adds unwanted \n\t
        while '\t\t' in text:
            text = text.replace('\t\t', '\t')
        for unwanted in ('\r\n\t', '\n\t'):
            text = text.replace(unwanted, '')

        content = normalize_split_text(text)
        # Be sure we have empty string in the base url
        self.base_url = ''

        # Markup used to present both error messages below
        error_box = '<div class="error"><p><strong>%s</strong></p></div>'

        # We create an element tree from the HTML content
        # The content is a list of string, line per line
        # We can concatenate all in one string
        html_str = u'\n'.join(content)
        try:
            html_tree = HTML(html_str)
        except AssertionError as reason:
            # we suspect user has created or uploaded malformed HTML, try to show input as preformatted code
            msg = error_box % _('Error: malformed HTML: {reason}.').format(reason=reason)
            html_str = ''.join(['<html>', msg, '<pre>', html_str, '</pre></html>'])
            try:
                html_tree = HTML(html_str)
            except ValueError:
                msg = error_box % _('Error: malformed HTML. Try viewing source with Highlight or Modify links.')
                html_str = ''.join(['<html>', msg, '</html>'])
                html_tree = HTML(html_str)

        # We should have a root element, which will be converted as <page>
        # for the DOM Tree.
        # NB : If <html> used, it will be converted back to <div> after
        # one roundtrip
        if html_tree.tag.name != 'html':
            html_str = ''.join(['<div>', html_str, '</div>'])
            html_tree = HTML(html_str)

        # Start the conversion of the first element
        # Every child of each element will be recursively convert too
        converted = self.do_children(html_tree)

        # Add Global element to our DOM Tree
        return moin_page.page(children=[moin_page.body(children=converted)])
Example #27
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page whose body holds a single download link for rev.
     """
     name = rev.item.name
     query = 'do=get&rev={0}'.format(rev.revid)
     target = Iri(scheme='wiki', authority='', path='/' + name, query=query)
     label = u"Download {0}.".format(name)
     link = moin_page.a(attrib={xlink.href: target}, children=[label])
     return moin_page.page(children=(moin_page.body(children=(link, )), ))
Example #28
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page listing the contents of the archive stored in rev.

     Each archive member becomes one table row of size, timestamp and name.
     When ArchiveException is raised, it is logged and the error text is
     shown in a one-cell table instead.

     :param rev: revision whose item name and data are read
     :param contenttype: unused
     :param arguments: unused
     :returns: a moin_page.page element containing a body with one table
     """
     self.item_name = rev.item.name
     try:
         contents = self.list_contents(rev.data)
         contents = [(self.process_size(size),
                      self.process_datetime(dt),
                      self.process_name(name),
                      ) for size, dt, name in contents]
         table = self.build_dom_table(contents, head=[_("Size"), _("Timestamp"), _("Name")], cls='zebra')
     except ArchiveException as err:
         logging.exception("An exception within archive file handling occurred:")
         # XXX we also use a table for error reporting, could be
         # something more adequate, though:
         table = self.build_dom_table([[str(err)]])
     # Wrap the table in body/page on both paths so that callers get a
     # page element for errors as well, not a bare table.
     body = moin_page.body(children=(table, ))
     return moin_page.page(children=(body, ))
Example #29
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page with one object element that links to rev's data.

     The object carries self.input_type as its type and a fallback text
     as its only child.
     """
     name = rev.item.name
     query = 'do=get&rev={0}'.format(rev.revid)
     target = Iri(scheme='wiki', authority='', path='/' + name, query=query)
     attributes = {
         moin_page.type_: unicode(self.input_type),
         xlink.href: target,
     }
     fallback = u'Your Browser does not support HTML5 audio/video element.'
     obj = moin_page.object_(attrib=attributes, children=[fallback])
     return moin_page.page(children=(moin_page.body(children=(obj, )), ))
Example #30
0
    def block_nowiki_repl(self, iter_content, stack, nowiki):
        """
        Handles a complete nowiki block

        The first line decides what happens: an unescaped end marker yields an
        empty blockcode, a match of nowiki_interpret_re selects a parser, and
        anything else is collected verbatim into a blockcode element.
        """
        stack.clear()

        try:
            firstline = iter_content.next()
        except StopIteration:
            # No content at all
            stack.push(moin_page.blockcode())
            return

        # Stop directly if we got an end marker in the first line
        end = self.nowiki_end_re.match(firstline)
        if end and not end.group('escape'):
            stack.push(moin_page.blockcode())
            return

        lines = _Iter(self.block_nowiki_lines(iter_content),
                      startno=iter_content.lineno)

        interpret = self.nowiki_interpret_re.match(firstline)
        if not interpret:
            # Plain block: first line followed by the remaining lines
            code = moin_page.blockcode(children=(firstline, ))
            stack.top_append(code)
            for text in lines:
                code.append('\n')
                code.append(text)
            return

        name = interpret.group('nowiki_name')
        args = interpret.group('nowiki_args')
        if args:
            args = parse_arguments(args)

        if name and name != 'creole':
            stack.top_append(self.parser(name, args, lines))
        else:
            # Parse it directly if the type is ourself
            body = self.parse_block(lines, args)
            stack.top_append(moin_page.page(children=(body, )))
Example #31
0
    def block_nowiki_repl(self, iter_content, stack, nowiki):
        """
        Handles a complete nowiki block

        The first line decides what happens: an unescaped end marker yields an
        empty blockcode, a match of nowiki_interpret_re selects a parser, and
        anything else is collected verbatim into a blockcode element.
        """
        stack.clear()

        try:
            firstline = iter_content.next()
        except StopIteration:
            # No content at all
            stack.push(moin_page.blockcode())
            return

        # Stop directly if we got an end marker in the first line
        end = self.nowiki_end_re.match(firstline)
        if end and not end.group("escape"):
            stack.push(moin_page.blockcode())
            return

        lines = _Iter(self.block_nowiki_lines(iter_content), startno=iter_content.lineno)

        interpret = self.nowiki_interpret_re.match(firstline)
        if not interpret:
            # Plain block: first line followed by the remaining lines
            code = moin_page.blockcode(children=(firstline,))
            stack.top_append(code)
            for text in lines:
                code.append("\n")
                code.append(text)
            return

        name = interpret.group("nowiki_name")
        args = interpret.group("nowiki_args")
        if args:
            args = parse_arguments(args)

        if name and name != "creole":
            stack.top_append(self.parser(name, args, lines))
        else:
            # Parse it directly if the type is ourself
            body = self.parse_block(lines, args)
            stack.top_append(moin_page.page(children=(body,)))
Example #32
0
 def __call__(self, data, contenttype=None, arguments=None):
     """
     Convert CSV data into a page holding one table.

     The dialect is sniffed from the first line; empty rows are dropped.
     """
     lines = normalize_split_text(decode_data(data, contenttype))
     # as of py 2.7.x (and in the year 2013), the csv module seems to still
     # have troubles with unicode, thus we hand it utf-8 encoded lines ...
     encoded_lines = [line.encode('utf-8') for line in lines]
     dialect = csv.Sniffer().sniff(encoded_lines[0])
     # ... and decode every cell back to unicode
     rows = []
     for encoded_row in csv.reader(encoded_lines, dialect):
         cells = [cell.decode('utf-8') for cell in encoded_row]
         if cells:
             rows.append(cells)
     table = self.build_dom_table(rows)
     return moin_page.page(children=(moin_page.body(children=(table, )), ))
Example #33
0
    def __call__(self, rev, contenttype=None, arguments=None):
        """
        Return a page whose body holds one object element pointing at rev.

        The link query starts as do=get plus the revision id; query parameters
        found on the xinclude.href keyword argument are merged into it. When
        arguments are given, their keyword dict is used (and updated in place)
        as the attributes of the object element.

        :param rev: revision providing the item name and revision id
        :param contenttype: unused
        :param arguments: optional arguments object with a keyword mapping
        :returns: a moin_page.page element containing a body with one object
        """
        item_name = rev.item.name
        query_keys = {'do': 'get', 'rev': rev.revid}
        attrib = {}
        if arguments:
            href = arguments.keyword.get(xinclude.href)
            # The href keyword may be missing; reading .query from the None
            # returned by get() would raise AttributeError.
            if href is not None and href.query:
                query_keys.update(url_decode(href.query))
            attrib = arguments.keyword

        query = url_encode(query_keys, charset=CHARSET, encode_keys=True)

        attrib.update({
            moin_page.type_: unicode(self.input_type),
            xlink.href: Iri(scheme='wiki', authority='', path='/' + item_name,
                            query=query),
        })

        obj = moin_page.object_(attrib=attrib, children=[item_name, ])
        body = moin_page.body(children=(obj, ))
        return moin_page.page(children=(body, ))
Example #34
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """
     Return a page listing the contents of the archive stored in rev.

     Each archive member becomes one table row of size, timestamp and name.
     When ArchiveException is raised, it is logged and the error text is
     shown in a one-cell table instead.

     :param rev: revision whose item name and data are read
     :param contenttype: unused
     :param arguments: unused
     :returns: a moin_page.page element containing a body with one table
     """
     self.item_name = rev.item.name
     try:
         contents = self.list_contents(rev.data)
         contents = [(
             self.process_size(size),
             self.process_datetime(dt),
             self.process_name(name),
         ) for size, dt, name in contents]
         table = self.build_dom_table(
             contents,
             head=[_("Size"), _("Timestamp"),
                   _("Name")],
             cls='zebra')
     except ArchiveException as err:
         logging.exception(
             "An exception within archive file handling occurred:")
         # XXX we also use a table for error reporting, could be
         # something more adequate, though:
         table = self.build_dom_table([[str(err)]])
     # Wrap the table in body/page on both paths so that callers get a
     # page element for errors as well, not a bare table.
     body = moin_page.body(children=(table, ))
     return moin_page.page(children=(body, ))
Example #35
0
class Converter(object):
    # {{{ html conversion

    # HTML tags that keep their name and only move to the moin_page namespace
    symmetric_tags = {'div', 'p', 'strong', 'code', 'quote', 'blockquote'}

    # HTML tags that define a list, except dl which is a little bit different
    list_tags = {'ul', 'ol'}

    # HTML tags that are converted, without attributes, to a different DOM tag
    simple_tags = {
        # Emphasis
        'em': moin_page.emphasis,
        'i': moin_page.emphasis,
        # Strong
        'b': moin_page.strong,
        'strong': moin_page.strong,
        # Code and Blockcode
        'pre': moin_page.blockcode,
        'tt': moin_page.code,
        'samp': moin_page.code,
        # Lists
        'dl': moin_page.list_item,
        'dt': moin_page.list_item_label,
        'dd': moin_page.list_item_body,
        # Table - th and td require special processing for alignment of cell contents
        'table': moin_page.table,
        'thead': moin_page.table_header,
        'tbody': moin_page.table_body,
        'tr': moin_page.table_row,
    }

    # HTML tags without an equivalent in the DOM tree;
    # the information is kept using <span element>
    inline_tags = {'abbr', 'acronym', 'address', 'dfn', 'kbd'}

    # HTML tags which are completely ignored by our converter.
    # We do not even process the children of these elements.
    ignored_tags = {
        'applet', 'area', 'button', 'caption', 'center',
        'fieldset', 'form', 'frame', 'frameset', 'head',
        'iframe', 'input', 'isindex', 'label', 'legend',
        'link', 'map', 'menu', 'noframes', 'noscript',
        'optgroup', 'option', 'param', 'script', 'select',
        'style', 'textarea', 'title', 'var',
    }

    # standard_attributes are html attributes which are used
    # directly in the DOM tree, without any conversion
    standard_attributes = {'title', 'class', 'style'}

    # Regular expression to detect an html heading tag
    heading_re = re.compile('h[1-6]')

    def new(self, tag, attrib, children):
        """
        Build and return a DOM tree element with the given tag,
        attributes and children.
        """
        node = ET.Element(tag, attrib=attrib, children=children)
        return node

    def new_copy(self, tag, element, attrib):
        """
        Copy one element into the DOM Tree under a new tag.

        The converted attributes of the element are merged into attrib
        (which is updated in place), then the children of the element are
        converted and the new element is built from both.
        """
        attrib.update(self.convert_attributes(element))
        return self.new(tag, attrib, self.do_children(element))

    def new_copy_symmetric(self, element, attrib):
        """
        Copy the element under a QName built from its own tag and the
        moin_page namespace; the copy itself is done by new_copy.
        """
        return self.new_copy(ET.QName(element.tag, moin_page), element, attrib)

    def convert_attributes(self, element):
        """
        Return a dict with the attributes of element that are kept.

        Attributes listed in standard_attributes are stored under html(name),
        the 'id' attribute is stored under xml('id'); all others are dropped.
        """
        converted = {}
        for name, value in element.attrib.iteritems():
            if name in self.standard_attributes:
                converted[html(name)] = value
            if name == 'id':
                converted[xml('id')] = value
        return converted

    def visit_heading(self, element):
        """
        Convert a heading tag into an h element of the moin_page
        namespace, storing the level as its outline-level attribute.
        """
        # The second character of the tag is used as the level
        level = element.tag[1]
        attrib = {moin_page('outline-level'): level}
        return self.new_copy(moin_page.h, element, attrib)

    def visit_br(self, element):
        """
        Return a new moin_page line_break element; element itself is not used.
        """
        line_break = moin_page.line_break()
        return line_break

    def visit_big(self, element):
        """
        Copy the element as a span with font-size 120%.
        """
        attrib = {moin_page('font-size'): '120%'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_small(self, element):
        """
        Copy the element as a span with font-size 85%.
        """
        attrib = {moin_page('font-size'): '85%'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_sub(self, element):
        """
        Copy the element as a span with baseline-shift 'sub'.
        """
        attrib = {moin_page('baseline-shift'): 'sub'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_sup(self, element):
        """
        Copy the element as a span with baseline-shift 'super'.
        """
        attrib = {moin_page('baseline-shift'): 'super'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_u(self, element):
        """
        Copy the element as a span with text-decoration 'underline'.
        """
        attrib = {moin_page('text-decoration'): 'underline'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_ins(self, element):
        """
        Copy the element as a span with text-decoration 'underline'.
        """
        attrib = {moin_page('text-decoration'): 'underline'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_del(self, element):
        """
        Copy the element as a span with text-decoration 'line-through'.
        """
        attrib = {moin_page('text-decoration'): 'line-through'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_s(self, element):
        """
        Copy the element as a span with text-decoration 'line-through'.
        """
        attrib = {moin_page('text-decoration'): 'line-through'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_strike(self, element):
        """
        Copy the element as a span with text-decoration 'line-through'.
        """
        attrib = {moin_page('text-decoration'): 'line-through'}
        return self.new_copy(moin_page.span, element, attrib)

    def visit_hr(self, element, default_class=u'moin-hr3'):
        """
        Copy the element as a separator whose class starts as default_class.
        """
        attrib = {moin_page.class_: default_class}
        return self.new_copy(moin_page.separator, element, attrib)

    def visit_img(self, element):
        """
        <img src="URI" /> --> <object xlink:href="URI" />

        A src without a scheme becomes an xinclude.include element targeting
        the wiki.local scheme; otherwise an object element linking to the src
        is returned. A non-empty alt attribute is kept in both cases.
        """
        src = element.attrib.get('src')
        url = Iri(src)
        attrib = {}
        alt = element.attrib.get('alt')
        if alt:
            attrib[html.alt] = alt
        if url.scheme is not None:
            # object tag
            attrib[xlink.href] = url
            return moin_page.object(attrib)
        # img tag
        attrib[xinclude.href] = Iri(scheme='wiki.local', path=src, fragment=None)
        return xinclude.include(attrib=attrib)

    def visit_object(self, element):
        """
        <object data="href"></object> --> <object xlink="href" />

        The data attribute is prefixed with self.base_url when that is set.
        """
        key = xlink('href')
        target = element.get(html.data)
        if self.base_url:
            target = ''.join([self.base_url, target])
        # Convert the href attribute into unicode
        return moin_page.object({key: unicode(target)})

    def visit_inline(self, element):
        """
        Handle the inline tags that have no dedicated conversion
        (dispatched from visit() for tags listed in inline_tags).

        The result is a moin_page span whose html class attribute is
        'html-' followed by the original tag name.
        """
        class_key = html.class_
        class_value = ''.join(['html-', element.tag.name])
        return self.new_copy(moin_page.span, element, {class_key: class_value})

    def visit_li(self, element):
        """
        Convert a <li> element to a list-item holding one list-item-body.

        A complete list item looks like::

            <list-item>
                <list-item-label>label</list-item-label>
                <list-item-body>Body</list-item-body>
            </list-item>

        A <li> element has no label, so only the body is generated; it
        contains the converted children of the <li>.
        """
        body_tag = moin_page.list_item_body
        converted_children = self.do_children(element)
        body = ET.Element(body_tag, attrib={}, children=converted_children)
        return ET.Element(moin_page.list_item, attrib={}, children=[body])

    def visit_list(self, element):
        """
        Convert an ordered or unordered list element to a moin_page list.

        HTML such as::

            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
            </ul>

        becomes::

            <list>
                <list-item>
                    <list-item-body>Item 1</list-item-body>
                </list-item>
                <list-item>
                    <list-item-body>Item 2</list-item-body>
                </list-item>
            </list>

        The item-label-generate attribute is 'unordered' for <ul> and <dir>,
        'ordered' for <ol>, and omitted for any other tag.
        """
        tag = element.tag
        if tag == "ol":
            label_style = 'ordered'
        elif tag == "ul" or tag == "dir":
            label_style = 'unordered'
        else:
            label_style = None

        attrib = {}
        if label_style is not None:
            attrib[moin_page('item-label-generate')] = label_style

        items = self.do_children(element)
        return ET.Element(moin_page.list, attrib=attrib, children=items)

    def visit_a(self, element):
        """
        Convert an <a> element to a moin_page link.

        The href is run through postproc_text first. When its scheme is not
        accepted by allowed_uri_scheme, the processed href text itself is
        returned in place of a link element.
        """
        href_key = xlink('href')
        href = postproc_text(self.markdown, element.attrib.get("href"))
        if not allowed_uri_scheme(href):
            return href
        return self.new_copy(moin_page.a, element, {href_key: href})

    def convert_align_to_class(self, attrib):
        """
        Translate an 'align' attribute into a moin_page class attribute.

        :param attrib: mapping of the source element's attributes
        :returns: a new dict holding the class attribute when align is one of
                  right, center or left; otherwise an empty dict
        """
        alignment = attrib.get('align')
        if alignment not in (u'right', u'center', u'left'):
            return {}
        return {moin_page.class_: alignment}

    def visit_th(self, element):
        """
        Convert a <th> element to html.th, mapping its align attribute
        to a class via convert_align_to_class.
        """
        align_class = self.convert_align_to_class(element.attrib)
        return self.new_copy(html.th, element, attrib=align_class)

    def visit_td(self, element):
        """
        Convert a <td> element to html.td, mapping its align attribute
        to a class via convert_align_to_class.
        """
        align_class = self.convert_align_to_class(element.attrib)
        return self.new_copy(html.td, element, attrib=align_class)

    def visit(self, element):
        """
        Dispatch one element to the conversion that matches its tag.

        The checks run in this order: symmetric tags, simple tags, list tags,
        inline tags, headings, then a visit_<tag> method if one exists.

        :param element: the element to convert
        :returns: whatever the selected conversion returns, or None (after
                  logging) when the tag is ignored or unhandled
        """
        tag = element.tag

        # Only the namespace has to change
        if tag in self.symmetric_tags:
            return self.new_copy_symmetric(element, attrib={})

        # Only the tag name has to change
        if tag in self.simple_tags:
            return self.new_copy(self.simple_tags[tag], element, attrib={})

        # Any kind of list
        if tag in self.list_tags:
            return self.visit_list(element)

        # Inline tags become a span carrying the original tag name
        if tag in self.inline_tags:
            return self.visit_inline(element)

        # Headings
        if self.heading_re.match(tag):
            return self.visit_heading(element)

        # Everything else needs a dedicated visit_<tag> method
        handler = getattr(self, 'visit_' + tag, None)
        if handler:
            return handler(element)

        # No conversion available: log why the element is dropped
        if tag in self.ignored_tags:
            message = "INFO : Ignored tag : {0}"
        else:
            message = "INFO : Unhandled tag : {0}"
        logging.info(message.format(tag))
        return None

    def do_children(self, element, add_lineno=False):
        """
        Convert the text and child elements of element into a flat list.

        :param element: node whose text, children and child tails are converted
        :param add_lineno: when true and self.line_numbers is non-empty, each
                           single node returned by visit() gets a data_lineno
                           attribute popped from the left of self.line_numbers
        :returns: list of converted nodes and post-processed text strings
        """
        result = []

        # markdown parser surrounds child nodes with unwanted u"\n" children, so a lone leading \n is dropped
        leading_text = getattr(element, "text", None)
        if leading_text is not None and leading_text != u'\n':
            result.append(postproc_text(self.markdown, leading_text))

        for child in element:
            converted = self.visit(child)
            if converted is None:
                converted = ()
            elif not isinstance(converted, (list, tuple)):
                if add_lineno and self.line_numbers:
                    converted.attrib[html.data_lineno] = self.line_numbers.popleft()
                converted = (converted, )
            result.extend(converted)

            # a lone trailing \n after the child is dropped for the same reason
            trailing_text = getattr(child, "tail", None)
            if trailing_text is not None and trailing_text != u'\n':
                result.append(postproc_text(self.markdown, trailing_text))
        return result

    # }}}

    def count_lines(self, text):
        """
        Record the starting line number of each markdown block in self.line_numbers.

        The markdown parser neither reports source line numbers nor offers an easy hook
        for adding them. Instead, the input is split on pairs of adjacent line ends, which
        approximates the parser's own block splitting, and the first line of each piece is
        stored in a deque. The do_children method consumes that deque.

        Splitting on two adjacent line ends is only an approximation. Known problems:

            * blank lines within lists create separate blocks
            * omitting a blank line after a heading combines 2 elements into one block
            * using more than one blank line between blocks

        As a result the deque may hold too few or too many entries, so the
        double-click-to-edit autoscroll textarea can be off by several lines.

        TODO: revisit this when the parsing errors documented in contrib/serialized/items.moin
        (markdown item) are fixed.

        :param text: the whitespace-normalized markdown source
        """
        starts = deque()
        current = 1
        inside_indented = False
        for chunk in text.split(u'\n\n'):
            if not chunk:
                # empty chunks are discarded, but the lines they stand for still count
                current += 2
                continue
            newlines = chunk.count(u'\n')

            # consecutive indented chunks separated by blank lines count as one block
            indented = chunk.startswith((u'    ', u'\n    '))
            if indented and inside_indented:
                current += newlines + 2
                continue
            inside_indented = indented

            # a chunk starting with a line end really begins one line further down
            offset = 1 if chunk.startswith(u'\n') else 0
            starts.append(current + offset)
            current += newlines + 2
        self.line_numbers = starts

    def embedded_markup(self, text):
        """
        Convert a text string containing HTML tags into a DOM node.

        Per http://meta.stackexchange.com/questions/1777/what-html-tags-are-allowed-on-stack-exchange-sites
        markdown markup allows users to specify several "safe" HTML tags within a document. These tags include:

            a b blockquote code del dd dl dt em h1 h2 h3 i img kbd li ol p pre s sup sub strong strike ul br hr

        Some markdown extensions also output raw HTML tags (e.g. fenced outputs "<pre><code>...").
        To keep the <, > characters from being escaped, the embedded tags are turned into nodes with
        the converter in html_in.py.

        :param text: text that contains embedded markup
        :returns: the P node produced by the html_in converter when the result has the expected
                  page/body/p shape; otherwise the converter result, or text itself when the
                  converter raised AssertionError
        """
        try:
            # wrapping in <p> works around a possible bug - there is a traceback if HTML document has no tags
            converted = html_in_converter(u'<p>%s</p>' % text)
        except AssertionError:
            # html_in converter (EmeraldTree) throws exceptions on markup style links: "Some text <http://moinmo.in> more text"
            converted = text

        if isinstance(converted, unicode):
            return converted
        if not converted.tag == moin_page.page:
            return converted
        body = converted[0]
        if not body.tag == moin_page.body:
            return converted
        paragraph = body[0]
        if paragraph.tag == moin_page.p:
            # a P node with block children is repaired later by convert_invalid_p_nodes
            return paragraph
        return converted

    def convert_embedded_markup(self, node):
        """
        Walk the tree and replace text children containing '<' with the
        result of embedded_markup.

        :param node: a tree node; modified in place
        """
        for position, item in enumerate(node):
            if not isinstance(item, unicode):
                # markup within a <pre> tag (blockcode) is left untouched
                if not item.tag == moin_page.blockcode:
                    self.convert_embedded_markup(item)
                continue
            if u'<' in item:
                # strings are immutable, so the slot in the parent is reassigned
                node[position] = self.embedded_markup(item)

    def convert_invalid_p_nodes(self, node):
        """
        Turn every P node that encloses a block element into a DIV node.

        Processing embedded HTML tags within markup, or output from extensions with embedded
        markup, can yield invalid HTML in which a <p> tag encloses a block element. Such
        occurrences are found recursively and the <p> tag is changed to a <div>.

        :param node: a tree node; modified in place
        """
        for item in node:
            if isinstance(item, unicode):
                continue
            if item.tag == moin_page.p and len(item):
                encloses_block = any(
                    not isinstance(sub, unicode) and sub.tag in BLOCK_ELEMENTS
                    for sub in item)
                if encloses_block:
                    item.tag = moin_page.div
            self.convert_invalid_p_nodes(item)

    def __init__(self):
        """
        Create the Markdown instance used for parsing, with the
        'extra' and 'toc' extensions enabled.
        """
        enabled_extensions = ['extra', 'toc']
        self.markdown = Markdown(extensions=enabled_extensions)

    @classmethod
    def _factory(cls, input, output, **kw):
        """
        Return a new converter instance; input, output and kw are not used.
        """
        converter = cls()
        return converter

    def __call__(self, data, contenttype=None, arguments=None):
        """
        Convert markdown input into a moin_page DOM tree.

        The text is normalized, split into lines, run through the markdown
        preprocessors, block parser and tree processors, and the resulting
        tree is converted with do_children. Embedded HTML markup and invalid
        P nodes are fixed up afterwards.

        :param data: input handed to decode_data together with contenttype
        :param contenttype: content type handed to decode_data
        :param arguments: not used
        :returns: a moin_page.page element whose only child is a moin_page.body
                  holding the converted nodes
        :raises UnicodeDecodeError: when the decoded text cannot be coerced to unicode
        """
        text = decode_data(data, contenttype)

        # {{{ stolen from Markdown.convert

        # Fixup the source text
        try:
            text = unicode(text)
        except UnicodeDecodeError as e:
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        # Normalize whitespace: strip STX/ETX markers, unify line ends,
        # expand tabs and empty out lines that hold only spaces
        text = text.replace(md_util.STX, "").replace(md_util.ETX, "")
        text = text.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        text = text.expandtabs(self.markdown.tab_length)
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        # save the starting line number of each block; consumed by do_children
        self.count_lines(text)

        # Split into lines and run the line preprocessors.
        lines = text.split("\n")
        for prep in self.markdown.preprocessors.values():
            lines = prep.run(lines)

        # Parse the high-level elements.
        md_root = self.markdown.parser.parseDocument(lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.markdown.treeprocessors.values():
            new_md_root = treeprocessor.run(md_root)
            if new_md_root:
                md_root = new_md_root

        # }}}

        # md_root is the root element of the tree built by the markdown parser
        # (plain old Python ElementTree objects); do_children converts its children.

        add_lineno = bool(flaskg and flaskg.add_lineno_attr)
        converted = self.do_children(md_root, add_lineno=add_lineno)
        body = moin_page.body(children=converted)
        root = moin_page.page(children=[body])
        # convert html embedded in text strings to nodes
        self.convert_embedded_markup(root)
        # convert P nodes containing block elements to DIV nodes
        self.convert_invalid_p_nodes(root)

        return root
Example #36
0
    def handle_nowiki(self, elem, page):
        """
        Replace the children of a nowiki element with the DOM built from its content.

        {{{* where * may be #!wiki, #!csv, #!highlight python, "", etc., or an invalid argument.

        The three original children of elem are (marker_len, all_nowiki_args, content).
        They are removed and one new child is appended, chosen by the argument:

            * no argument: a plain blockcode holding the content
            * highlight <lexer> (or old style diff, cplusplus, python, java, pascal, irc):
              a pygments-highlighted blockcode
            * csv: a table built from the content
            * wiki, creole, rst, docbook, markdown, mediawiki, html: a page produced
              by the matching input converter
            * anything else, including highlight without a lexer name: invalid_args is
              called and the content is highlighted with the 'text' lexer

        :param elem: the nowiki element; modified in place
        :param page: not read by this method
        """
        logging.debug("handle_nowiki elem: %r" % elem)
        marker_len, all_nowiki_args, content = elem._children
        nowiki_args = all_nowiki_args[0].strip()

        # remove all the old children of the element, new children will be added
        elem.remove_all()

        if not nowiki_args:
            # input similar to: {{{\ntext\n}}}\n
            blockcode = moin_page.blockcode(children=(content, ))
            elem.append(blockcode)
            return

        if nowiki_args.startswith('#!') and len(nowiki_args) > 2:
            arguments = nowiki_args[2:].split(' ', 1)  # skip leading #!
            nowiki_name = arguments[0]
            optional_args = arguments[1] if len(arguments) > 1 else None
        else:
            nowiki_name = optional_args = None

        lexer = None
        if nowiki_name in set(('diff', 'cplusplus', 'python', 'java', 'pascal', 'irc')):
            # make old style markup similar to {{{#!python like new style {{{#!highlight python
            optional_args = nowiki_name if not optional_args else nowiki_name + ' ' + optional_args
            nowiki_name = 'highlight'

        # "{{{#!highlight" without a lexer name leaves optional_args as None; skip the
        # lexer lookup then so the invalid-arguments fallback at the end handles it
        # instead of failing on None.split().
        if nowiki_name == u'highlight' and optional_args:
            # TODO: support moin 1.9 options like numbers=on start=222 step=10
            optional_args = optional_args.split()[0]  # ignore all parameters except lexer name
            try:
                lexer = pygments.lexers.get_lexer_by_name(optional_args)
            except ClassNotFound:
                try:
                    lexer = pygments.lexers.get_lexer_for_mimetype(optional_args)
                except ClassNotFound:
                    self.invalid_args(elem, all_nowiki_args)
                    lexer = pygments.lexers.get_lexer_by_name('text')
        if lexer:
            blockcode = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
            pygments.highlight(content, lexer, TreeFormatter(), blockcode)
            elem.append(blockcode)
            return

        if nowiki_name in ('csv', 'text/csv'):
            # TODO: support moin 1.9 options: quotechar, show, hide, autofilter, name, link, static_cols, etc
            delim = None
            if optional_args:
                m = re.search('delimiter=(.?)', optional_args)
                if m and m.group(1):
                    delim = m.group(1)
                if not delim:
                    delim = optional_args.split()[0]  # ignore all parameters except a delimiter in first position
                    if len(delim) > 1:
                        delim = None
            sep = delim or u';'
            content = content.split('\n')
            # first line is the table head, the remaining lines are the rows
            head = content[0].split(sep)
            rows = [x.split(sep) for x in content[1:]]
            csv_builder = TableMixin()
            table = csv_builder.build_dom_table(rows, head=head, cls='moin-csv-table moin-sortable')
            elem.append(table)
            return

        if nowiki_name in ('wiki', 'text/x.moin.wiki',):
            from .moinwiki_in import Converter as moinwiki_converter
            moinwiki = moinwiki_converter()
            lines = normalize_split_text(content)
            lines = _Iter(lines)
            # reparse arguments from original: {{{#!wiki solid/orange (style="color: red;")
            wiki_args = parse_arguments(all_nowiki_args[0][2:])
            if len(wiki_args.positional) > 1:
                # positional arguments after the first become the class attribute
                wiki_args.keyword['class'] = u' '.join(wiki_args.positional[1:])
            del wiki_args.positional[:]
            body = moinwiki.parse_block(lines, wiki_args)
            page = moin_page.page(children=(body, ))
            elem.append(page)
            return

        if nowiki_name in ('creole', 'text/x.moin.creole'):
            from .creole_in import Converter as creole_converter
            creole = creole_converter()
            lines = normalize_split_text(content)
            lines = _Iter(lines)
            body = creole.parse_block(lines, optional_args)
            page = moin_page.page(children=(body, ))
            elem.append(page)
            return

        if nowiki_name in ('rst', 'text/x-rst'):
            from .rst_in import Converter as rst_converter
            rst = rst_converter()
            page = rst(content, contenttype=u'text/x-rst;charset=utf-8')
            elem.append(page)
            return

        if nowiki_name in ('docbook', 'application/docbook+xml'):
            from .docbook_in import Converter as docbook_converter
            docbook = docbook_converter()
            page = docbook(content, contenttype=u'application/docbook+xml;charset=utf-8')
            elem.append(page)
            return

        if nowiki_name in ('markdown', 'text/x-markdown'):
            from .markdown_in import Converter as markdown_converter
            markdown = markdown_converter()
            page = markdown(content, contenttype=u'text/x-markdown;charset=utf-8')
            elem.append(page)
            return

        if nowiki_name in ('mediawiki', 'text/x-mediawiki'):
            from .mediawiki_in import Converter as mediawiki_converter
            mediawiki = mediawiki_converter()
            page = mediawiki(content, optional_args)
            elem.append(page)
            return

        if nowiki_name in ('html', 'HTML', 'text/html'):
            from .html_in import Converter as html_converter
            html = html_converter()
            page = html(content, optional_args)
            elem.append(page)
            return

        # nothing matched: report the arguments as invalid and show the content as plain text
        self.invalid_args(elem, all_nowiki_args)
        lexer = pygments.lexers.get_lexer_by_name('text')
        blockcode = moin_page.blockcode(attrib={moin_page.class_: 'highlight'})
        pygments.highlight(content, lexer, TreeFormatter(), blockcode)
        elem.append(blockcode)
        return
Example #37
0
    def __call__(self, data, contenttype=None, arguments=None):
        """
        Convert markdown to moin DOM.

        data is a pointer to an open file (ProtectedRevision object)
        contenttype is likely == u'text/x-markdown;charset=utf-8'
        arguments is not used

        Markdown processing takes place in five steps:

        1. A bunch of "preprocessors" munge the input text.
        2. BlockParser() parses the high-level structural elements of the
           pre-processed text into an ElementTree.
        3. A bunch of "treeprocessors" are run against the ElementTree. One
           such treeprocessor runs InlinePatterns against the ElementTree,
           detecting inline markup.
        4. Some post-processors are run against the ElementTree nodes containing text
           and the ElementTree is converted to an EmeraldTree.
        5. The root of the EmeraldTree is returned.

        """
        # fetch the stored data as unicode text
        source = decode_data(data, contenttype)

        # Whitespace normalization, copied from NormalizeWhitespace in markdown/preprocessors.py
        source = source.replace(md_util.STX, "").replace(md_util.ETX, "")
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        source = source.expandtabs(self.markdown.tab_length)
        source = re.sub(r'(?<=\n) +\n', '\n', source)

        # remember where each block starts; used later for edit autoscroll
        self.count_lines(source)

        # {{{ stolen from Markdown.convert

        # Line preprocessors work on the text split into lines.
        lines = source.split("\n")
        for preprocessor in self.markdown.preprocessors.values():
            lines = preprocessor.run(lines)

        # The block parser builds an ElementTree; md_root is its root
        md_root = self.markdown.parser.parseDocument(lines).getroot()

        # A tree processor may return a replacement root
        for processor in self.markdown.treeprocessors.values():
            replacement = processor.run(md_root)
            if replacement:
                md_root = replacement

        # }}} end stolen from Markdown.convert

        add_lineno = bool(flaskg and flaskg.add_lineno_attr)

        # post-process text and turn the ElementTree children into EmeraldTree nodes
        nodes = self.do_children(md_root, add_lineno=add_lineno)

        # html embedded in text strings becomes EmeraldTree nodes
        self.convert_embedded_markup(nodes)
        # P-tags containing block elements become DIV-tags
        self.convert_invalid_p_nodes(nodes)

        body = moin_page.body(children=nodes)
        return moin_page.page(children=[body])