Ejemplo n.º 1
0
 def test_template_in_and_after_nowiki(self):
     # A template enclosed in <nowiki> must stay as escaped literal text,
     # while the same template placed after the closing tag is turned
     # into a <template> node.
     wikitext = 'Lorem <nowiki>{{ipsum}}</nowiki>{{ipsum}}'
     actual = preprocessToXml(wikitext)
     expected = (
         '<root>Lorem &lt;nowiki&gt;{{ipsum}}&lt;/nowiki&gt;'
         + '<template><title>ipsum</title></template></root>'
     )
     self.assertEqual(actual, expected)
Ejemplo n.º 2
0
 def test_tag_unclosed(self):
     # An HTML tag that is never closed is expected to come out escaped,
     # and the template following it must still be recognised.
     template_name = 'ipsum'
     leading_text = 'Lorem <div>'
     wikitext = '%s{{%s}}' % (leading_text, template_name)
     actual = preprocessToXml(wikitext)
     expected_head = '<root>%s' % htmlspecialchars(leading_text)
     expected_tail = \
         '<template><title>%s</title></template></root>' % template_name
     self.assertEqual(actual, expected_head + expected_tail)
Ejemplo n.º 3
0
 def test_template_with_argument_unicode(self):
     # Non-ASCII characters in both the template name and its single
     # positional argument must pass through unchanged.
     template_name = 'ølipsum'
     argument = 'ål€en'
     actual = preprocessToXml('{{%s|%s}}' % (template_name, argument))
     title_xml = '<root><template><title>%s</title>' % template_name
     part_xml = \
         '<part><name index="1" /><value>%s</value></part>' % argument
     expected = title_xml + part_xml + '</template></root>'
     self.assertEqual(actual, expected)
Ejemplo n.º 4
0
 def test_template_with_tplarg(self):
     # A fairly complicated case: the template name is itself a tplarg
     # ({{{Domino}}}). Handling it also involves putting stack elements
     # back onto the stack once the closing }}} has been consumed.
     wikitext = 'LLorem ipsum {{{{{Domino}}} | est = infinitus }}'
     actual = preprocessToXml(wikitext)
     expected = (
         '<root>LLorem ipsum <template><title>'
         + '<tplarg><title>Domino</title></tplarg> </title>'
         + '<part><name> est </name>=<value> infinitus </value></part>'
         + '</template></root>'
     )
     self.assertEqual(actual, expected)
Ejemplo n.º 5
0
 def test_nested_templates(self):
     # Three templates nested inside each other: each inner template is
     # the first positional argument of the one enclosing it.
     names = ('Unus', 'Duo', 'Infinitas')
     wikitext = '{{%s|{{%s|{{%s}}}}}}' % names
     actual = preprocessToXml('%s' % wikitext)

     def template_xml(title, first_arg):
         # Expected XML for one template; an empty first_arg means the
         # template has no <part> at all.
         if first_arg == '':
             part = ''
         else:
             part = ('<part><name index="1" /><value>%s</value></part>'
                     % first_arg)
         return '<template><title>%s</title>' % title + part + '</template>'

     innermost = template_xml(names[2], '')
     middle = template_xml(names[1], innermost)
     outermost = template_xml(names[0], middle)
     expected = '<root>%s</root>' % outermost
     self.assertEqual(actual, expected)
Ejemplo n.º 6
0
def _strip_tables(text):
    """Return text with every wiki table ('{|' ... '|}') removed.

    Tables may be nested; only text found at nesting depth 0 is kept.
    A stray '|}' seen while no table is open is dropped on its own and
    does not affect the text around it. Without that guard the depth
    counter would go negative and everything up to the end of the input
    would be thrown away. If a table is opened but never closed, the text
    from its opening marker to the end of the input is discarded.
    """
    kept = []
    depth = 0
    cpos = 0
    while True:
        openpos = text.find('{|', cpos)
        closepos = text.find('|}', cpos)
        if openpos == -1 and closepos == -1:
            break

        # Handle whichever marker comes first; a missing one (-1) never wins.
        next_is_open = closepos == -1 or (openpos != -1 and openpos < closepos)
        if next_is_open:
            if depth == 0:
                kept.append(text[cpos:openpos])
            cpos = openpos + 2
            depth += 1
        else:
            if depth == 0:
                # Unbalanced closer: keep the text before it, skip the marker
                # and leave the depth untouched.
                kept.append(text[cpos:closepos])
            else:
                depth -= 1
            cpos = closepos + 2

    if depth == 0:
        kept.append(text[cpos:])
    return ''.join(kept)


def get_body_text(text):
    """Reduce wikitext to its plain running text.

    The text is run through preprocessToXml, '&lt;' / '&gt;' are turned back
    into '<' / '>', and the result is parsed with
    fromstring(condition_for_lxml(xml)). Only the root element's own text and
    the tail text of its direct children are kept, so whatever became a child
    element (e.g. templates, judging by the preprocessor's output format) is
    left out. Tables, header and bold/italic markers, list lines, categories,
    external links and interwiki links are then removed, and wikilinks and
    image links are replaced by their display text.

    Returns the remaining whitespace-separated tokens joined by single
    spaces, without tokens that consist of exactly one punctuation character.
    """
    xml = preprocessToXml(text)
    xml = xml.replace('&lt;', '<').replace('&gt;', '>')

    root = fromstring(condition_for_lxml(xml))

    # Text directly under the root: its leading text plus each child's tail.
    pieces = [root.text or '']
    pieces.extend(child.tail for child in root.iterchildren() if child.tail)
    out = ''.join(pieces)

    out = _strip_tables(out)

    out = re.sub(r'==[=]*', '', out)                                 # drop header markers (but keep header text)
    out = re.sub(r"''[']*", '', out)                                 # drop bold/italic markers (but keep text)

    # Note that re.sub has no flags support in python2.6, which is why we use re.compile
    list_line = re.compile(r'^(?:#|\*).*?$', flags=re.MULTILINE)     # drop lists altogether
    out = list_line.sub('', out)

    out = re.sub(r'\[\[Kategori:[^\]]+\]\]', '', out)                # drop categories
    out = re.sub(r'(?<!\[)\[(?!\[)[^ ]+ [^\]]+\]', '', out)          # drop external links
    out = re.sub(r'\[\[(?:[^:|\]]+\|)?([^:\]]+)\]\]', '\\1', out)    # wikilinks as text, '[[Artikkel 1|artikkelen]]' -> 'artikkelen'
    out = re.sub(r'\[\[(?:Fil|File|Image|Bilde):[^\]]+\|([^\]]+)\]\]', '\\1', out)  # image descriptions only
    out = re.sub(r'\[\[[A-Za-z\-]+:[^\]]+\]\]', '', out)             # drop interwikis

    # Drop tokens that are a lone punctuation character and normalise all
    # whitespace runs to single spaces.
    exclude = set(string.punctuation)
    return ' '.join(token for token in out.split() if token not in exclude)
Ejemplo n.º 7
0
 def test_link(self):
     # A wikilink is expected to come out of the preprocessor unchanged.
     wikitext = 'Lorem [[ipsum]]'
     actual = preprocessToXml('%s' % wikitext)
     self.assertEqual(actual, '<root>%s</root>' % wikitext)
Ejemplo n.º 8
0
 def test_simple_template_unicode(self):
     # An argument-less template whose name contains non-ASCII characters
     # and a space.
     template_name = 'Lårem øpsum'
     actual = preprocessToXml('{{%s}}' % template_name)
     expected = (
         '<root><template><title>%s</title>' % template_name
         + '</template></root>'
     )
     self.assertEqual(actual, expected)
Ejemplo n.º 9
0
 def test_linebreak(self):
     # A lone linebreak must survive preprocessing instead of being eaten.
     newline = '\n'
     actual = preprocessToXml(newline)
     self.assertEqual(actual, '<root>%s</root>' % newline)
Ejemplo n.º 10
0
 def test_simple(self):
     # Plain text without any markup is only wrapped in <root>.
     plain = 'Lorem ipsum'
     actual = preprocessToXml(plain)
     self.assertEqual(actual, '<root>%s</root>' % plain)
Ejemplo n.º 11
0
 def test_nonmatching_braces3(self):
     # Only the innermost, properly closed pair of double braces forms a
     # template; the surplus opening braces stay as literal text.
     wikitext = '{{Lorem{{{ipsum}}dolor'
     actual = preprocessToXml(wikitext)
     expected = (
         '<root>{{Lorem{<template><title>ipsum</title>'
         + '</template>dolor</root>'
     )
     self.assertEqual(actual, expected)
Ejemplo n.º 12
0
 def test_html_tags2(self):
     # Ordinary HTML tags are expected to be emitted with escaped brackets.
     wikitext = 'Lorem<b>ipsum</b> ipsam'
     actual = preprocessToXml(wikitext)
     expected = '<root>Lorem&lt;b&gt;ipsum&lt;/b&gt; ipsam</root>'
     self.assertEqual(actual, expected)
Ejemplo n.º 13
0
 def test_template_in_math(self):
     # Template syntax inside <math> must not be parsed as a template;
     # the whole input is expected back as escaped text.
     wikitext = 'Lorem <math>{{ipsum}}</math>'
     actual = preprocessToXml('%s' % wikitext)
     expected = '<root>%s</root>' % htmlspecialchars(wikitext)
     self.assertEqual(actual, expected)
Ejemplo n.º 14
0
 def test_template_in_comment(self):
     # Template syntax inside an HTML comment must not be parsed as a
     # template; the whole input is expected back as escaped text.
     wikitext = 'Lorem <!--{{ipsum}}-->'
     actual = preprocessToXml('%s' % wikitext)
     expected = '<root>%s</root>' % htmlspecialchars(wikitext)
     self.assertEqual(actual, expected)
Ejemplo n.º 15
0
 def test_unclosed_template2(self):
     # With both closing braces missing, the opening braces are expected
     # to remain ordinary text.
     wikitext = '{{Lorem ipsum'
     actual = preprocessToXml('%s' % wikitext)
     self.assertEqual(actual, '<root>%s</root>' % wikitext)
Ejemplo n.º 16
0
 def test_comment_unclosed(self):
     # An HTML comment that is never closed is expected back as escaped
     # text.
     wikitext = 'Lorem <!-- ipsum '
     actual = preprocessToXml('%s' % wikitext)
     expected = '<root>%s</root>' % htmlspecialchars(wikitext)
     self.assertEqual(actual, expected)