Example #1
0
def test_identity():
    """Check that equal-looking BreakingReturn nodes are still distinct objects.

    Eight ``<br/>`` lines are parsed; every resulting BreakingReturn must be
    found among its own siblings, sit at the expected identity index in its
    parent's children, and compare equal to (but not identical with) every
    other BreakingReturn.
    """
    raw = """
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
""".decode("utf8")

    root = parseString(title="X33", raw=raw, wikidb=DummyDB())
    buildAdvancedTree(root)
    _treesanity(root)

    breaks = root.getChildNodesByClass(BreakingReturn)
    for position, node in enumerate(breaks):
        assert node in node.siblings
        assert position == _idIndex(node.parent.children, node)

        # Count the parent's children that are not this very object.
        others = sum(1 for child in node.parent.children if child is not node)
        assert others == len(breaks) - 1

        for other in breaks:
            if other is not node:
                assert node == other
                assert node is not other
Example #2
0
def cleanMarkupSingle(raw, cleanerMethod):
    """Parse ``raw`` and apply only ``cleanerMethod`` via TreeCleaner.clean().

    Returns a ``(tree, reports)`` tuple, where ``reports`` is whatever
    ``TreeCleaner.getReports()`` yields after the cleaning run.
    """
    tree = getTreeFromMarkup(raw)
    buildAdvancedTree(tree)

    cleaner = TreeCleaner(tree, save_reports=True)
    cleaner.clean([cleanerMethod])
    return (tree, cleaner.getReports())
Example #3
0
def test_copy():
    """Check that ``copy()`` of an advanced tree mirrors the original.

    The copy must pass ``_treesanity`` and match the source tree node by node
    in caption, class and number of children.
    """
    raw = """
===[[Leuchtturm|Leuchttürme]] auf Fehmarn===
*[[Leuchtturm Flügge]] super da
*[[Leuchtturm Marienleuchte]] da auch
*[[Leuchtturm Strukkamphuk]] spitze
*[[Leuchtturm Staberhuk]] supi
*[[Leuchtturm Westermarkelsdorf]]
""".decode("utf8")

    original = parseString(title="X33", raw=raw, wikidb=DummyDB())
    buildAdvancedTree(original)
    duplicate = original.copy()
    _treesanity(duplicate)

    def _check(n1, n2):
        # Compare this pair of nodes, then descend into the children pairwise.
        assert n1.caption == n2.caption
        assert n1.__class__ == n2.__class__
        assert len(n1.children) == len(n2.children)
        for index, child in enumerate(n1):
            _check(child, n2.children[index])

    _check(original, duplicate)
Example #4
0
def setup():
    """Assemble the hand-written fixture tree and return its root Article.

    The tree is described as a nested list: a plain node is appended to the
    children of the current parent, and a list that follows a node holds that
    node's children. The finished tree is passed through buildAdvancedTree.
    """
    # WARNING: the tests rely on this exact layout -- altering it will
    # probably break all of them. Edit with care.
    layout = [Article(),
              [Section(), [PreFormatted(), [Text("bla blub"), ImageLink()]]],
              [Table(), [Row(), [Cell(), PreFormatted(), [Text("jo")]], ImageLink()]],
              [Section(), [Section(), [Strong()]]],
              [Text("bla")],
              ]

    def attach(items, parent):
        previous = None
        for item in items:
            if isinstance(item, list):
                # A list always describes the children of the node before it.
                assert previous
                attach(item, previous)
                continue
            if parent:
                parent.children.append(item)
            previous = item

    attach(layout, None)
    root = layout[0]
    buildAdvancedTree(root)

    # Debugging aid: import mwlib.parser, sys; mwlib.parser.show(sys.stderr, root, 0)

    return root
Example #5
0
def test_ulist():
    """A ``*:`` continuation line must not create a second Item.

    See http://code.pediapress.com/wiki/ticket/222
    """
    raw = u"""
* A item
*: B Previous item continues.
"""
    tree = parseString(title='t', raw=raw)
    buildAdvancedTree(tree)
    # Debugging aid: parser.show(sys.stdout, tree)
    items = tree.getChildNodesByClass(Item)
    assert len(items) == 1
Example #6
0
 def purify_string(self, string, title='__no_name__'):
     """
     Run template handling on ``string``, parse the result, build the
     advanced tree and return whatever ``self.purify`` produces for it
     (a list of PureSections).
     """
     expanded = self.templateactions.handle_templates(string, title=title)

     wiki = self.env.wiki
     language = self.env.wiki.siteinfo["general"]["lang"]
     uniquifier = self.templateactions.exp.uniquifier
     parsed = myParseString.myParseString(title=title, raw=expanded, wikidb=wiki,
                                          lang=language, uniq=uniquifier)

     advtree.buildAdvancedTree(parsed)
     return self.purify(parsed)
Example #7
0
def getXML(wikitext):
    """Convert ``wikitext`` to ODF XML and return it as a string.

    The text is parsed against a DummyDB, preprocessed, dumped to stdout via
    ``mwlib.parser.show``, written with ``ODFWriter.writeTest`` and passed to
    ``validate`` before the serialized XML is returned.
    """
    tree = parseString(title="test", raw=wikitext, wikidb=DummyDB())
    advtree.buildAdvancedTree(tree)
    preprocess(tree)
    mwlib.parser.show(sys.stdout, tree)

    writer = ODFWriter()
    writer.writeTest(tree)
    validate(writer)

    # Tip: print the returned value to inspect the generated XML.
    return writer.asstring()
Example #8
0
def preprocess(root):
    """Build the advanced tree for ``root`` and run ``TreeCleaner.cleanAll()`` on it.

    Returns nothing.
    """
    advtree.buildAdvancedTree(root)
    TreeCleaner(root).cleanAll()
Example #9
0
def renderMW(txt, filesuffix=None):
    """Parse ``txt``, clean the tree and render it through RlWriter.

    A fresh temporary directory is created and used both as the writer's
    ``tmpdir`` and as the base directory of the dummy image database. The
    elements produced by the writer are handed to ``renderElements`` together
    with ``filesuffix`` and that directory.
    """
    tree = uparser.parseString(title='Test', raw=txt)

    advtree.buildAdvancedTree(tree)
    TreeCleaner(tree).cleanAll()

    workdir = tempfile.mkdtemp()
    writer = RlWriter(test_mode=True)
    writer.wikiTitle = 'testwiki'
    writer.tmpdir = workdir
    writer.imgDB = dummyImageDB(basedir=workdir)

    renderElements(writer.write(tree), filesuffix, workdir)
Example #10
0
def test_attributes():
    t1 = '''
{|
|- STYLE="BACKGROUND:#FFDEAD;"
|stuff
|}
'''
    r = parseString(title='t', raw=t1)
    buildAdvancedTree(r)
    n = r.getChildNodesByClass(Row)[0]
    print n.attributes, n.style
    assert isinstance(n.style, dict)
    assert isinstance(n.attributes, dict)
    assert n.style["background"] == "#FFDEAD"
Example #11
0
    def _servXML(self, args, query, dialect="mwxml"):
        if not len(args):
            self._doc(error="require articlename")
            return
        unknown = [k for k in query if k not in ("debug", "imageresolver")]
        if unknown:
            return self._doc(error="unknown option %r" % unknown)
        title = args.pop()
        base_url = "http://%s/" % ("/".join(args) or default_baseurl)
        debug = bool(query.setdefault("debug", [default_debug])[0])

        language = "en" # FIXME
        namespace="en.wikipedia.org" # FIXME

        print "_servXML", title, base_url, debug

        db = mwapidb.WikiDB(base_url)
        db.print_template = None # deactivate print template lookups
        tree = db.getParsedArticle(title, revision=None)

        if dialect == "mwxhtml":
            xhtmlwriter.preprocess(tree)
            dbw = xhtmlwriter.MWXHTMLWriter(imagesrcresolver=imagesrcresolver,
                                            debug=False)
        elif dialect == "mwxml":
            advtree.buildAdvancedTree(tree) # this should be optional
            dbw = xhtmlwriter.MWXMLWriter() # 1:1 XML from parse tree
        elif dialect == "dbxml":
            from mwlib import docbookwriter
            docbookwriter.preprocess(tree)
            dbw = docbookwriter.DocBookWriter(imagesrcresolver=imagesrcresolver,
                                            debug=debug) 
        else:
            raise Exception, "unkonwn export"


        dbw.writeBook(tree)
        if debug:
            dbw.writeparsetree(tree)

        response = dbw.asstring()
        
        self.send_response(200)
        self.send_header("Content-type", "text/xml")
        self.send_header("Content-length", str(len(response)))
        self.end_headers()
        self.wfile.write(response)
        
        # shut down the connection
        self.wfile.flush()
Example #12
0
def test_defintion_list():
    """A ``;term`` / ``:description`` pair yields one DefinitionList with a term and a description.

    The check runs twice: on the markup as written and on the same markup
    with every newline removed.
    See http://code.pediapress.com/wiki/ticket/221
    """
    raw = u''';termA
:descr1
'''

    for markup in (raw, raw.replace('\n', '')):
        tree = parseString(title='t', raw=markup)
        buildAdvancedTree(tree)
        lists = tree.getChildNodesByClass(DefinitionList)
        assert len(lists) == 1
        assert lists[0].getChildNodesByClass(DefinitionTerm)
        assert lists[0].getChildNodesByClass(DefinitionDescription)
Example #13
0
def cleanMarkup(raw):
    print "Parsing %r" % (raw,)

    tree = getTreeFromMarkup(raw)

    print "before treecleaner: >>>"
    showTree(tree)
    print "<<<"

    print '=' * 20
    buildAdvancedTree(tree)
    tc = TreeCleaner(tree, save_reports=True)
    tc.cleanAll(skipMethods=[])
    reports = tc.getReports()
    print "after treecleaner: >>>"
    showTree(tree)
    print "<<<"
    return (tree, reports)
Example #14
0
def test_definitiondescription():
    raw = u"""
== test ==

:One
::Two
:::Three
::::Four

"""
    db = DummyDB()
    r = parseString(title="t", raw=raw, wikidb=db)
    parser.show(sys.stdout, r)

    buildAdvancedTree(r)
    dd = r.getChildNodesByClass(DefinitionDescription)
    print "DD:", dd
    for c in dd:
        assert c.indentlevel == 1
    assert len(dd) == 4
Example #15
0
def worker(q, names, templr, noder):
    """Take page names from ``names`` and put one Record per purified section on ``q``.

    Only non-redirect pages in the main namespace are processed; everything
    else is counted as skipped. Each name is fetched with a one-second
    timeout. When the queue stays empty for that long, the counters are
    logged, ``None`` is put on ``q`` (presumably as an end marker for the
    consumer) and the worker returns. Any other exception is logged and the
    loop carries on with the next name.
    """
    env = templr.env
    purifier = classify.Preprocessor(env, templr, noder)
    match_redirect = nshandling.get_redirect_matcher(env.wiki.siteinfo, env.wiki.nshandler)

    articles = 0
    skipped = 0

    while True:
        try:
            name = names.get(True, 1)

            # Pages outside the main namespace are not examined.
            if env.wiki.nshandler.splitname(name)[0] != nshandling.NS_MAIN:
                skipped += 1
                continue

            raw = env.wiki.reader[name]

            # Redirects are ignored as well.
            if match_redirect(raw):
                skipped += 1
                continue

            expanded = templr.handle_templates(raw, title=name)
            tree = myParseString.myParseString(title=name, raw=expanded, wikidb=env.wiki,
                                               lang=env.wiki.siteinfo["general"]["lang"],
                                               uniq=templr.exp.uniquifier)
            advtree.buildAdvancedTree(tree)

            # One record for every section of the article.
            for section in purifier.purify(tree):
                q.put(Record(section.heading, len(section.content)))
            articles += 1
        except Queue.Empty:
            log.logger.info("examined " + str(articles) + " articles, skipped " + str(skipped))
            q.put(None)
            return
        except Exception as excp:
            msg = name + u": " + unicode(excp)
            log.logger.error(msg.encode("utf-8", "ignore"))
            if DEBUG:
                log.logger.error(traceback.format_exc())
Example #16
0
def test_removeNewlines():
    """Exercise the whitespace handling done by buildAdvancedTree.

    Three cases: text inside PreFormatted is left alone, a whitespace-only
    Text inside a Section is removed, and a Text with real content keeps its
    length but ends up without any newline.
    """
    blank = u"\t \n\t\n\n  \n\n"

    # Case 1: nothing happens inside preformatted text.
    pre = PreFormatted()
    pre_text = Text(blank)
    pre.children.append(pre_text)
    buildAdvancedTree(pre)
    _treesanity(pre)
    assert pre_text.caption == blank

    # Case 2: a whitespace-only node at the border is removed entirely.
    section = Section()
    section.children.append(Text(blank))
    buildAdvancedTree(section)
    _treesanity(section)
    assert not section.children

    # Case 3: newlines disappear, the node and its length stay.
    padded = u"\t \n\t\n\n KEEP  \n\n"
    section = Section()
    kept = Text(padded)
    section.children.append(kept)
    buildAdvancedTree(section)
    _treesanity(section)
    assert kept.caption.count("\n") == 0
    assert len(kept.caption) == len(padded)
    assert section.children
Example #17
0
def test_removeNewlines():
    """Exercise the whitespace handling done by buildAdvancedTree.

    Three cases: text inside PreFormatted is left alone, a whitespace-only
    Text inside a Section is removed, and a Text with real content keeps its
    length but ends up without any newline.
    """
    blank = u"\t \n\t\n\n  \n\n"

    # Case 1: nothing happens inside preformatted text.
    pre = PreFormatted()
    pre_text = Text(blank)
    pre.children.append(pre_text)
    buildAdvancedTree(pre)
    _treesanity(pre)
    assert pre_text.caption == blank

    # Case 2: a whitespace-only node at the border is removed entirely.
    section = Section()
    section.children.append(Text(blank))
    buildAdvancedTree(section)
    _treesanity(section)
    assert not section.children

    # Case 3: newlines disappear, the node and its length stay.
    padded = u"\t \n\t\n\n KEEP  \n\n"
    section = Section()
    kept = Text(padded)
    section.children.append(kept)
    buildAdvancedTree(section)
    _treesanity(section)
    assert kept.caption.count("\n") == 0
    assert len(kept.caption) == len(padded)
    assert section.children
Example #18
0
    def parse_and_purify(self, title, follow_redirects=False):
        """
        Parse the named article and return it as a list of PureSections.

        ``title`` is decoded from UTF-8 before the lookup. For a redirect the
        result is None unless ``follow_redirects`` is set, in which case the
        redirect target is parsed instead. The flag is not passed on to that
        nested call, so a target that is itself a redirect yields None.
        """
        title = title.decode('utf-8')
        raw = self.env.wiki.reader[title]

        # Redirects are handled first; self.rm gives a truthy target for them.
        target = self.rm(raw)
        if target:
            if not follow_redirects:
                return None
            logger.info(title + ' redirects to ' + target)
            return self.parse_and_purify(target)

        expanded = self.templateactions.handle_templates(raw, title=title)
        parsed = myParseString.myParseString(title=title, raw=expanded, wikidb=self.env.wiki,
                                             lang=self.env.wiki.siteinfo["general"]["lang"],
                                             uniq=self.templateactions.exp.uniquifier)
        advtree.buildAdvancedTree(parsed)
        return self.purify(parsed)
Example #19
0
def test_copy():
    """Check that ``copy()`` of an advanced tree mirrors the original.

    The copy must pass ``_treesanity`` and match the source tree node by node
    in caption, class and number of children.
    """
    raw = """
===[[Leuchtturm|Leuchttürme]] auf Fehmarn===
*[[Leuchtturm Flügge]] super da
*[[Leuchtturm Marienleuchte]] da auch
*[[Leuchtturm Strukkamphuk]] spitze
*[[Leuchtturm Staberhuk]] supi
*[[Leuchtturm Westermarkelsdorf]]
""".decode("utf8")

    original = parseString(title="X33", raw=raw, wikidb=DummyDB())
    buildAdvancedTree(original)
    duplicate = original.copy()
    _treesanity(duplicate)

    def _check(n1, n2):
        # Compare this pair of nodes, then descend into the children pairwise.
        assert n1.caption == n2.caption
        assert n1.__class__ == n2.__class__
        assert len(n1.children) == len(n2.children)
        for index, child in enumerate(n1):
            _check(child, n2.children[index])

    _check(original, duplicate)
Example #20
0
def setup():
    """Assemble the hand-written fixture tree and return its root Article.

    The tree is described as a nested list: a plain node is appended to the
    children of the current parent, and a list that follows a node holds that
    node's children. The finished tree is passed through buildAdvancedTree.
    """
    # WARNING: the tests rely on this exact layout -- altering it will
    # probably break all of them. Edit with care.
    layout = [
        Article(),
        [Section(), [PreFormatted(), [Text("bla blub"), ImageLink()]]],
        [Table(), [Row(), [Cell(), PreFormatted(), [Text("jo")]], ImageLink()]],
        [Section(), [Section(), [Strong()]]],
        [Text("bla")],
    ]

    def attach(items, parent):
        previous = None
        for item in items:
            if isinstance(item, list):
                # A list always describes the children of the node before it.
                assert previous
                attach(item, previous)
                continue
            if parent:
                parent.children.append(item)
            previous = item

    attach(layout, None)
    root = layout[0]
    buildAdvancedTree(root)

    # Debugging aid: import mwlib.parser, sys; mwlib.parser.show(sys.stderr, root, 0)

    return root
Example #21
0
def test_colspan():
    """Check the ``colspan`` value exposed by Cell nodes.

    A non-numeric or negative ``colspan`` attribute must yield 1; a valid
    value must be reported as given.
    """
    cases = [
        ('''<table><tr><td colspan="bogus">no colspan </td></tr></table>''', 1),
        ('''<table><tr><td colspan="-1">no colspan </td></tr></table>''', 1),
        ('''<table><tr><td colspan="2">colspan1</td></tr></table>''', 2),
    ]
    for raw, expected in cases:
        r = parseString(title='t', raw=raw)
        buildAdvancedTree(r)
        # Compare by value: ``is`` on an int only works through CPython's
        # small-integer caching and is not guaranteed by the language.
        assert r.getChildNodesByClass(Cell)[0].colspan == expected
Example #22
0
def test_colspan():
    """Check the ``colspan`` value exposed by Cell nodes.

    A non-numeric or negative ``colspan`` attribute must yield 1; a valid
    value must be reported as given.
    """
    cases = [
        ('''<table><tr><td colspan="bogus">no colspan </td></tr></table>''', 1),
        ('''<table><tr><td colspan="-1">no colspan </td></tr></table>''', 1),
        ('''<table><tr><td colspan="2">colspan1</td></tr></table>''', 2),
    ]
    for raw, expected in cases:
        r = parseString(title='t', raw=raw)
        buildAdvancedTree(r)
        # Compare by value: ``is`` on an int only works through CPython's
        # small-integer caching and is not guaranteed by the language.
        assert r.getChildNodesByClass(Cell)[0].colspan == expected
def preprocess(root):
    """Build the advanced tree for ``root`` and run ``TreeCleaner.cleanAll()`` on it.

    Returns nothing.
    """
    advtree.buildAdvancedTree(root)
    TreeCleaner(root).cleanAll()
Example #24
0
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import time

s = unicode(open(sys.argv[1], "rb").read(), "utf-8")

from mwlib import uparser, advtree, treecleaner
from mwlib.refine import compat

stime = time.time()
r = compat.parse_txt(s)
print "parse:", time.time() - stime

stime = time.time()
advtree.buildAdvancedTree(r)
print "tree", time.time() - stime

stime = time.time()
tc = treecleaner.TreeCleaner(r)
tc.cleanAll()
print "clean:", time.time() - stime
Example #25
0
def buildAdvTree(raw):
    """Parse ``raw``, build the advanced tree, run ``TreeCleaner.cleanAll`` and return the tree.

    The cleaner is created with ``save_reports=True`` and ``cleanAll`` is
    called with an empty ``skipMethods`` list; the reports are not returned.
    """
    parsed = getTreeFromMarkup(raw)
    advtree.buildAdvancedTree(parsed)
    TreeCleaner(parsed, save_reports=True).cleanAll(skipMethods=[])
    return parsed
Example #26
0
def preprocess(root):
    """Build the advanced tree for ``root`` and run ``TreeCleaner.cleanAll()`` on it.

    Returns nothing.
    """
    advtree.buildAdvancedTree(root)
    TreeCleaner(root).cleanAll()
Example #27
0
def buildAdvTree(raw):
    """Parse ``raw``, build the advanced tree, run ``TreeCleaner.cleanAll`` and return the tree.

    The cleaner is created with ``save_reports=True`` and ``cleanAll`` is
    called with an empty ``skipMethods`` list; the reports are not returned.
    """
    parsed = getTreeFromMarkup(raw)
    advtree.buildAdvancedTree(parsed)
    TreeCleaner(parsed, save_reports=True).cleanAll(skipMethods=[])
    return parsed
Example #28
0
def getAdvTree(raw):
    """Parse ``raw`` under the title 'test', build the advanced tree and return it."""
    parsed = parseString(title='test', raw=raw)
    buildAdvancedTree(parsed)
    return parsed
Example #29
0
def getTreeFromMarkup(raw):
    """Parse ``raw`` against a fresh DummyDB, build the advanced tree and return it."""
    parsed = parseString(title="Test", raw=raw, wikidb=DummyDB())
    advtree.buildAdvancedTree(parsed)
    return parsed
Example #30
0
def getTreeFromMarkup(raw):
    """Parse ``raw`` against a fresh DummyDB, build the advanced tree and return it."""
    parsed = parseString(title="Test", raw=raw, wikidb=DummyDB())
    advtree.buildAdvancedTree(parsed)
    return parsed
Example #31
0
def preprocess(root):
    """Build the advanced tree for ``root``, then apply the xmltreecleaner fix-ups.

    The fix-ups run in this order: removeChildlessNodes, fixLists,
    fixParagraphs, fixBlockElements. Returns nothing.
    """
    advtree.buildAdvancedTree(root)

    fixups = (
        xmltreecleaner.removeChildlessNodes,
        xmltreecleaner.fixLists,
        xmltreecleaner.fixParagraphs,
        xmltreecleaner.fixBlockElements,
    )
    for fixup in fixups:
        fixup(root)
Example #32
0
{{Harv|Smith|2006| p=25}}

"Cleanup" expands to "Asbox" that should be removed<br />
{{Cleanup}}

<h2>Markup tests</h2>
I wonder what happens to the heading above...
== Wiki style header ==
As opposed to this one


 """
    name = "Test"

    if args.file:
        name = args.article
        markup = util.file2s(args.article)
    else:
        page = env.wiki.get_page(args.article)
        raw = page.rawtext
        name = page.names[-1]
        markup =  act.handle_templates(raw, title=name)

    tree = myParseString.myParseString(title=name, raw=markup, wikidb=env.wiki,
                                       lang=env.wiki.siteinfo["general"]["lang"], uniq=act.exp.uniquifier)
    advtree.buildAdvancedTree(tree)

    sections = purifier.purify(tree)
    for s in sections:
        print s
Example #33
0
def getAdvTree(raw):
    """Parse ``raw`` under the title 'test', build the advanced tree and return it."""
    parsed = parseString(title='test', raw=raw)
    buildAdvancedTree(parsed)
    return parsed
Example #34
0
def preprocess(root):
    """Build the advanced tree for ``root``, then apply the xmltreecleaner fix-ups.

    The fix-ups run in this order: removeChildlessNodes, fixLists,
    fixParagraphs, fixBlockElements. Returns nothing.
    """
    advtree.buildAdvancedTree(root)

    fixups = (
        xmltreecleaner.removeChildlessNodes,
        xmltreecleaner.fixLists,
        xmltreecleaner.fixParagraphs,
        xmltreecleaner.fixBlockElements,
    )
    for fixup in fixups:
        fixup(root)