Example #1
0
def test_colspan():
    """Invalid colspan attributes must fall back to a colspan of 1."""
    # non-numeric colspan value is ignored
    raw = '''<table><tr><td colspan="bogus">no colspan </td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    # compare with == rather than `is`: small-int identity is a CPython
    # implementation detail, not a language guarantee
    assert r.getChildNodesByClass(Cell)[0].colspan == 1

    # negative colspan value is ignored
    raw = '''<table><tr><td colspan="-1">no colspan </td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 1

    # a valid numeric colspan is kept as-is
    raw = '''<table><tr><td colspan="2">colspan1</td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 2
Example #2
0
def test_colspan():
    """Invalid colspan attributes must fall back to a colspan of 1."""
    # non-numeric colspan value is ignored
    raw = '''<table><tr><td colspan="bogus">no colspan </td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    # compare with == rather than `is`: small-int identity is a CPython
    # implementation detail, not a language guarantee
    assert r.getChildNodesByClass(Cell)[0].colspan == 1

    # negative colspan value is ignored
    raw = '''<table><tr><td colspan="-1">no colspan </td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 1

    # a valid numeric colspan is kept as-is
    raw = '''<table><tr><td colspan="2">colspan1</td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 2
Example #3
0
def convert_pagecontent(title, content):
    """
    Convert a string in Mediawiki content format to a string in
    Dokuwiki content format.
    """

    # this is a hack for mwlib discarding the content of <nowiki> tags
    # and replacing them with plaintext parsed HTML versions of the
    # content (pragmatic, but not what we want)
    nowiki_plaintext = []

    # Instead we save the content here, replace it with the "magic" placeholder
    # tag <__yamdwe_nowiki> and the index where the content was saved, then pass
    # the list of nowiki content into the parser as context.
    def add_nowiki_block(match):
        nowiki_plaintext.append(match.group(0))
        return "<__yamdwe_nowiki>%d</__yamdwe_nowiki>" % (len(nowiki_plaintext)-1,)
    content = re.sub(r"<nowiki>.+?</nowiki>", add_nowiki_block, content)

    root = uparser.parseString(title, content) # create parse tree
    context = {}
    context["list_stack"] = []
    context["nowiki_plaintext"] = nowiki_plaintext # hacky way of attaching to child nodes
    result = convert(root, context, False)

    # mwlib doesn't parse NOTOC, so check for it manually.
    # Bug fix: use re.search, not re.match — re.match anchors at position 0 of
    # the whole string, so despite re.MULTILINE a __NOTOC__ on any later line
    # was never detected.
    if re.search(r"^\s*__NOTOC__\s*$", content, re.MULTILINE):
        result = "~~NOTOC~~"+("\n" if not result.startswith("\n") else "")+result
    return result
Example #4
0
def test_image_link():
    """The Japanese namespace alias 画像 must resolve to the image namespace (6)."""
    tree = uparser.parseString('',
                               u'[[画像:Tajima mihonoura03s3200.jpg]]',
                               lang='ja')
    link = tree.find(parser.ImageLink)[0]
    assert link.target == u'画像:Tajima mihonoura03s3200.jpg'
    assert link.namespace == 6, "wrong namespace"
Example #5
0
def convert_pagecontent(title, content):
    """
    Convert a string in Mediawiki content format to a string in
    Dokuwiki content format.
    """

    # this is a hack for mwlib discarding the content of <nowiki> tags
    # and replacing them with plaintext parsed HTML versions of the
    # content (pragmatic, but not what we want)
    nowiki_plaintext = []

    # Instead we save the content here, replace it with the "magic" placeholder
    # tag <__yamdwe_nowiki> and the index where the content was saved, then pass
    # the list of nowiki content into the parser as context.
    def add_nowiki_block(match):
        nowiki_plaintext.append(match.group(0))
        return "<__yamdwe_nowiki>%d</__yamdwe_nowiki>" % (
            len(nowiki_plaintext) - 1, )

    content = re.sub(r"<nowiki>.+?</nowiki>", add_nowiki_block, content)

    root = uparser.parseString(title, content)  # create parse tree
    context = {}
    context["list_stack"] = []
    context[
        "nowiki_plaintext"] = nowiki_plaintext  # hacky way of attaching to child nodes
    result = convert(root, context, False)

    # mwlib doesn't parse NOTOC, so check for it manually.
    # Bug fix: use re.search, not re.match — re.match anchors at position 0 of
    # the whole string, so despite re.MULTILINE a __NOTOC__ on any later line
    # was never detected.
    if re.search(r"^\s*__NOTOC__\s*$", content, re.MULTILINE):
        result = "~~NOTOC~~" + ("\n" if not result.startswith("\n") else
                                "") + result
    return result
Example #6
0
def test_identity():
    """Sibling <br/> nodes must compare equal by value yet stay distinct objects."""
    raw = """
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
""".decode("utf8")

    db = DummyDB()
    r = parseString(title="X33", raw=raw, wikidb=db)
    buildAdvancedTree(r)
    _treesanity(r)

    brs = r.getChildNodesByClass(BreakingReturn)
    for i, br in enumerate(brs):
        # every node appears among its own siblings
        assert br in br.siblings
        # identity-based lookup must find the node's own position
        assert i == _idIndex(br.parent.children, br)
        # filtering by identity removes exactly this one node
        assert len([x for x in br.parent.children if x is not br]) == len(brs) - 1
        for bbr in brs:
            if br is bbr:
                continue
            # equal by value, but never the same object
            assert br == bbr
            assert br is not bbr
Example #7
0
def test_identity():
    """Sibling <br/> nodes must compare equal by value yet stay distinct objects."""
    raw = """
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
""".decode("utf8")

    db = DummyDB()
    r = parseString(title="X33", raw=raw, wikidb=db)
    buildAdvancedTree(r)
    _treesanity(r)

    brs = r.getChildNodesByClass(BreakingReturn)
    for i, br in enumerate(brs):
        # every node appears among its own siblings
        assert br in br.siblings
        # identity-based lookup must find the node's own position
        assert i == _idIndex(br.parent.children, br)
        # filtering by identity removes exactly this one node
        assert len([x for x in br.parent.children if x is not br]) == len(brs) - 1
        for bbr in brs:
            if br is bbr:
                continue
            # equal by value, but never the same object
            assert br == bbr
            assert br is not bbr
Example #8
0
def test_copy():
    """copy() must produce a sane, structurally identical duplicate tree."""
    raw = """
===[[Leuchtturm|Leuchttürme]] auf Fehmarn===
*[[Leuchtturm Flügge]] super da
*[[Leuchtturm Marienleuchte]] da auch
*[[Leuchtturm Strukkamphuk]] spitze
*[[Leuchtturm Staberhuk]] supi
*[[Leuchtturm Westermarkelsdorf]]
""".decode("utf8")

    db = DummyDB()
    tree = parseString(title="X33", raw=raw, wikidb=db)
    buildAdvancedTree(tree)
    duplicate = tree.copy()
    _treesanity(duplicate)

    def _assert_same(a, b):
        # caption, node class and child count must agree at every level
        assert a.caption == b.caption
        assert a.__class__ == b.__class__
        assert len(a.children) == len(b.children)
        for idx, child in enumerate(a):
            _assert_same(child, b.children[idx])

    _assert_same(tree, duplicate)
Example #9
0
    def getUserLinks(raw):
        """Return the sorted, de-duplicated user-link targets found in *raw*."""
        def isUserLink(node):
            # namespace 2 is NS_USER
            return isinstance(node, parser.NamespaceLink) and node.namespace == 2

        tree = uparser.parseString(title, raw=raw, wikidb=wikidb)
        return sorted({link.target for link in tree.filter(isUserLink)})
Example #10
0
def parse_tree(file_name):
    """Parse every recipe page in the dump *file_name* into a list of
    (pageid, title, parse_tree) tuples."""
    # Bug fix: the dump path was hard-coded to "data/wiki-data.xml",
    # silently ignoring the file_name argument.
    dp = dumpparser.DumpParser(file_name)
    data = []
    for recipe in dp:
        # NOTE(review): recipe.text is passed both as the title and the raw
        # text — looks suspicious (recipe.title seems intended), but kept
        # as-is; confirm against parser.parseString's signature.
        data.append((recipe.pageid,
                     recipe.title,
                     parser.parseString(recipe.text, recipe.text)))
    return data
Example #11
0
def test_tag_expand_vs_uniq():
    """A {{#tag:pre|...}} invocation must produce exactly one PreFormatted node."""
    db = DictDB(Foo="""{{#tag:pre|inside pre}}""")
    tree = uparser.parseString(title="Foo", wikidb=db)
    core.show(tree)
    nodes = tree.find(parser.PreFormatted)
    assert len(nodes) == 1, "expected a preformatted node"
Example #12
0
 def to_html(cls, kb_entry):
     """Render a knowledge-base entry's wiki body to an XHTML string."""
     body = kb_entry.body.replace("\r", "")  # drop carriage returns
     tree = parseString(title=kb_entry.subject, raw=body, wikidb=cls.NOCDB(kb_entry))
     preprocess(tree)
     writer = MWXHTMLWriter()
     writer.writeBook(tree)
     return ET.tostring(writer.xmlbody)
Example #13
0
def test_tag_expand_vs_uniq():
    """A {{#tag:pre|...}} invocation must produce exactly one PreFormatted node."""
    db = DictDB(Foo="""{{#tag:pre|inside pre}}""")
    tree = uparser.parseString(title="Foo", wikidb=db)
    core.show(tree)
    nodes = tree.find(parser.PreFormatted)
    assert len(nodes) == 1, "expected a preformatted node"
Example #14
0
def getXHTML(wikitext):
    """Render *wikitext* to an XHTML string while silencing writer output."""
    tree = parseString(title="", raw=wikitext, wikidb=DummyDB())
    preprocess(tree)
    writer = MWXHTMLWriter()
    with SuppressOutput():
        writer.writeBook(tree)
    return writer.asstring()
Example #15
0
def getAdvTree(fn):
    """Parse the UTF-8 wikitext file *fn* and return its advanced parse tree."""
    from mwlib.dummydb import DummyDB
    from mwlib.uparser import parseString
    db = DummyDB()
    # close the file handle explicitly (the original leaked it) and avoid
    # shadowing the `input` builtin
    f = open(fn)
    try:
        raw = unicode(f.read(), 'utf8')
    finally:
        f.close()
    r = parseString(title=fn, raw=raw, wikidb=db)
    buildAdvancedTree(r)
    return r
Example #16
0
def getXHTML(wikitext):
    """Render *wikitext* to an XHTML string, dumping the tree to stdout."""
    tree = parseString(title="test", raw=wikitext, wikidb=DummyDB())
    preprocess(tree)
    show(sys.stdout, tree)
    writer = MWXHTMLWriter()
    writer.writeBook(tree)
    return writer.asstring()
Example #17
0
def parse():
    """Command-line helper: parse the given articles (or every article with
    --all) and print a G(ood)/F(ail) line with the parse time for each."""
    parser = optparse.OptionParser(
        usage="%prog [-a|--all] --config CONFIG [ARTICLE1 ...]")
    parser.add_option("-a",
                      "--all",
                      action="store_true",
                      help="parse all articles")
    parser.add_option("--tb",
                      action="store_true",
                      help="show traceback on error")

    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")

    options, args = parser.parse_args()

    if not args and not options.all:
        parser.error("missing option.")

    if not options.config:
        parser.error("missing --config argument")

    # article names come in as bytes on the command line; decode to unicode
    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.config

    import traceback
    from mwlib import wiki, uparser

    w = wiki.makewiki(conf)

    db = w.wiki

    if options.all:
        if not hasattr(db, "articles"):
            raise RuntimeError(
                "%s does not support iterating over all articles" % (db, ))
        articles = db.articles()

    import time
    for x in articles:
        try:
            page = db.normalize_and_get_page(x, 0)
            if page:
                raw = page.rawtext
            else:
                raw = None

            # yes, raw can be None, when we have a redirect to a non-existing article.
            if raw is None:
                continue
            stime = time.time()
            a = uparser.parseString(x, raw=raw, wikidb=db)
        except Exception as err:
            print "F", repr(x), err
            if options.tb:
                traceback.print_exc()
        else:
            # only reached when the try body succeeded, so stime is defined
            print "G", time.time() - stime, repr(x)
Example #18
0
def parse_wiki(name, wiki, make_math_png=False):
    """Render raw *wiki* markup (article *name*) to an HTML string.

    Math images are rendered lazily unless make_math_png is true.
    """
    wikidb = cdbwiki.WikiDB(default_wiki_dir)
    tree = uparser.parseString(name, raw=wiki, wikidb=wikidb)
    buf = StringIO.StringIO()
    renderer = rendermath.Renderer(basedir=default_math_dir,
            lazy=(not make_math_png))
    writer = htmlwriter.HTMLWriter(buf, images=None, math_renderer=renderer)
    writer.write(tree)
    return buf.getvalue()
Example #19
0
def simpleparse(raw):    # !!! USE FOR DEBUGGING ONLY !!!
    """Parse UTF-8 *raw* wikitext against a dummy DB, dump the tree to stdout
    and return it. Debugging helper only."""
    import sys
    from mwlib import dummydb, parser
    from mwlib.uparser import parseString
    decoded = raw.decode('utf8')
    tree = parseString(title="title", raw=decoded, wikidb=dummydb.DummyDB())
    buildAdvancedTree(tree)
    parser.show(sys.stdout, tree, 0)
    return tree
 def getParsedArticle(self, title, revision=None):
     """Return the parse tree for *title*/*revision*, or None when no raw
     wikitext exists."""
     raw = self.getRawArticle(title, revision=revision)
     if raw is None:
         return None
     # NOTE(review): `article` is assigned but never used below — possibly
     # kept for _getArticle's side effects; confirm before removing.
     article = self._getArticle(title, revision=revision)
     # pick up the source language, if a source is known for this article
     lang = None
     source = self.getSource(title, revision=revision)
     if source is not None:
         lang = source.get('language')
     return uparser.parseString(title=title, raw=raw, wikidb=self, lang=lang)
Example #21
0
def test_ulist():
    """http://code.pediapress.com/wiki/ticket/222"""
    raw = u"""
* A item
*: B Previous item continues.
"""
    tree = parseString(title='t', raw=raw)
    buildAdvancedTree(tree)
    # the "*:" continuation line must merge into the single list item
    assert len(tree.getChildNodesByClass(Item)) == 1
Example #22
0
def test_ulist():
    """http://code.pediapress.com/wiki/ticket/222"""
    raw = u"""
* A item
*: B Previous item continues.
"""
    tree = parseString(title='t', raw=raw)
    buildAdvancedTree(tree)
    # the "*:" continuation line must merge into the single list item
    assert len(tree.getChildNodesByClass(Item)) == 1
Example #23
0
def getXHTML(wikitext, title, language):
    """Render *wikitext* to XHTML, or None if parsing yields nothing."""
    db = DummyDB()
    db.normalize_and_get_page = noop
    tree = parseString(title=title, raw=wikitext, wikidb=db, lang=language)
    if not tree:
        return None
    preprocess(tree)
    removeLangLinks(tree)
    writer = MWXHTMLWriter()
    writer.writeBook(tree)
    return writer.asstring()
def getXHTML(wikitext, title, language):
    """Render *wikitext* to XHTML, or None if parsing yields nothing."""
    db = DummyDB()
    db.normalize_and_get_page = noop
    tree = parseString(title=title, raw=wikitext, wikidb=db, lang=language)
    if not tree:
        return None
    preprocess(tree)
    removeLangLinks(tree)
    writer = MWXHTMLWriter()
    writer.writeBook(tree)
    return writer.asstring()
Example #25
0
def getXML(wikitext):
    """Convert *wikitext* to a DocBook XML string, dumping the parse tree
    before and after preprocessing for debugging."""
    db = DummyDB()
    r = parseString(title="test", raw=wikitext, wikidb=db)
    print "before preprocess"
    show(sys.stdout, r)
    preprocess(r)
    print "after preprocess"
    show(sys.stdout, r)
    dbw = DocBookWriter()
    dbw.dbwriteArticle(r)
    return dbw.asstring()
Example #26
0
	def parse(self, title, text):
		"""Parse wikitext *text* titled *title*; return (metadata, html_string)."""
		out = StringIO.StringIO()
		metadata = defaultdict(list)

		# NOTE(review): `metadata` appears to be populated by HTMLWriter as a
		# side effect (it is printed below before write() runs) — confirm.
		parsed = parseString(title, raw=text, wikidb = self.db)
		w = htmlwriter.HTMLWriter(out, metadata ,  self.options)
		print metadata
		w.write(parsed)
#		meta_data = [ ('key-word', ['first']), ("category", ['pierwsza', 'druga', 'trzecia']) ]

		return (metadata, out.getvalue())
Example #27
0
def main(titulo, archin, archout):
    """Read UTF-8 wikitext from *archin* and write it, rendered as HTML with
    title *titulo*, to *archout*."""
    out = codecs.open(archout, "w", "utf8")

    inp = codecs.open(archin, "r", "utf8")
    article_text = inp.read()
    inp.close()

    tree = uparser.parseString(titulo, raw=article_text, wikidb=dummydb.DummyDB())

    writer = htmlwriter.HTMLWriter(out)
    writer.write(tree)
    out.close()
Example #28
0
def getXML(wikitext):
    """Render *wikitext* through the ODF writer, validate it and return the
    generated XML string."""
    tree = parseString(title="test", raw=wikitext, wikidb=DummyDB())
    advtree.buildAdvancedTree(tree)
    preprocess(tree)
    mwlib.parser.show(sys.stdout, tree)
    writer = ODFWriter()
    writer.writeTest(tree)
    validate(writer)
    # the returned XML can be inspected manually when debugging
    return writer.asstring()
Example #29
0
    def getUserLinks(raw):
        """Return the sorted, de-duplicated user-link targets found in *raw*."""
        def isUserLink(node):
            return isinstance(node, parser.NamespaceLink) and node.namespace == 2  # NS_USER

        tree = uparser.parseString(title, raw=raw, wikidb=wikidb)
        return sorted({link.target for link in tree.filter(isUserLink)})
Example #30
0
def convert_pagecontent(title, content):
    """
    Convert a string in Mediawiki content format to a string in
    Dokuwiki content format.
    """
    # wrap the "magic" marker tag <__mw_nowiki> around <nowiki>, as
    # as mwlib just discards it otherwise and we can't detect it within the parser.
    # We keep the inner <nowiki> so the mwlib parser still skips that content.
    # Bug fix: the quantifier must be non-greedy (.+?) — a greedy .+ swallowed
    # everything from the first <nowiki> to the last </nowiki> when the page
    # contained more than one nowiki block.
    content = re.sub(r"<nowiki>.+?</nowiki>", lambda e: "<__mw_nowiki>"+e.group(0)+"</__mw_nowiki>", content)

    root = uparser.parseString(title, content) # create parse tree
    return convert(root, False)
Example #31
0
def parse():
    """Command-line helper: parse the given articles (or every article with
    --all) and print a G(ood)/F(ail) line with the parse time for each."""
    parser = optparse.OptionParser(usage="%prog [-a|--all] --config CONFIG [ARTICLE1 ...]")
    parser.add_option("-a", "--all", action="store_true", help="parse all articles")
    parser.add_option("--tb", action="store_true", help="show traceback on error")

    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")

    options, args = parser.parse_args()

    if not args and not options.all:
        parser.error("missing option.")

    if not options.config:
        parser.error("missing --config argument")

    # article names come in as bytes on the command line; decode to unicode
    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.config

    import traceback
    from mwlib import wiki, uparser

    w = wiki.makewiki(conf)

    db = w.wiki

    if options.all:
        if not hasattr(db, "articles"):
            raise RuntimeError("%s does not support iterating over all articles" % (db, ))
        articles = db.articles()


    import time
    for x in articles:
        try:
            page = db.normalize_and_get_page(x, 0)
            if page:
                raw = page.rawtext
            else:
                raw = None

            # yes, raw can be None, when we have a redirect to a non-existing article.
            if raw is None:
                continue
            stime=time.time()
            a=uparser.parseString(x, raw=raw, wikidb=db)
        except Exception, err:
            print "F", repr(x), err
            if options.tb:
                traceback.print_exc()
        else:
            # only reached when the try body succeeded, so stime is defined
            print "G", time.time()-stime, repr(x)
Example #32
0
def main():
    """Segment every page in the content wiki, expanding templates via templdb."""
    segmenter = MediaWikiWikiSegmenter()
    # template expansion uses a separate (Japanese) template database
    templdb = nuwiki.adapt(WikiDB(templdbPath, lang="ja"))

    contentdb = WikiDB(contentdbPath, lang="ja")

    for title, text in contentdb.reader.iteritems():
        tree = parseString(title=title, raw=text, wikidb=templdb)

        # progress goes to stderr so stdout stays clean for segment output
        print >>sys.stderr, title.encode("utf-8")
        output = segmenter.traverse(tree, [], 0)
        output = segmenter.cleanOutput(output)
        segmenter.printOutput(output, False)
Example #33
0
def test_attributes():
    """Row attributes must be parsed into dicts; STYLE keys are lower-cased."""
    t1 = '''
{|
|- STYLE="BACKGROUND:#FFDEAD;"
|stuff
|}
'''
    r = parseString(title='t', raw=t1)
    buildAdvancedTree(r)
    n = r.getChildNodesByClass(Row)[0]
    print n.attributes, n.style
    assert isinstance(n.style, dict)
    assert isinstance(n.attributes, dict)
    # the upper-case BACKGROUND key is normalised to lower case
    assert n.style["background"] == "#FFDEAD"
Example #34
0
def test_defintion_list():
    """http://code.pediapress.com/wiki/ticket/221"""
    raw = u''';termA
:descr1
'''

    # second iteration re-runs the same markup with all newlines stripped
    for _ in range(2):
        tree = parseString(title='t', raw=raw)
        buildAdvancedTree(tree)
        lists = tree.getChildNodesByClass(DefinitionList)
        assert len(lists) == 1
        assert lists[0].getChildNodesByClass(DefinitionTerm)
        assert lists[0].getChildNodesByClass(DefinitionDescription)
        raw = raw.replace('\n', '')
Example #35
0
def convert_pagecontent(title, content):
    """
    Convert a string in Mediawiki content format to a string in
    Dokuwiki content format.
    """
    # wrap the "magic" marker tag <__mw_nowiki> around <nowiki>, as
    # as mwlib just discards it otherwise and we can't detect it within the parser.
    # We keep the inner <nowiki> so the mwlib parser still skips that content.
    # Bug fix: the quantifier must be non-greedy (.+?) — a greedy .+ swallowed
    # everything from the first <nowiki> to the last </nowiki> when the page
    # contained more than one nowiki block.
    content = re.sub(r"<nowiki>.+?</nowiki>",
                     lambda e: "<__mw_nowiki>" + e.group(0) + "</__mw_nowiki>",
                     content)

    root = uparser.parseString(title, content)  # create parse tree
    return convert(root, False)
Example #36
0
def main():
    """Render each wikitext file given on the command line to <name>.html."""
    for fn in sys.argv[1:]:
        from mwlib.dummydb import DummyDB
        from mwlib.uparser import parseString
        db = DummyDB()
        # read/write with explicit close (the original leaked both handles)
        # and avoid shadowing the `input` builtin
        f = open(fn)
        try:
            raw = unicode(f.read(), 'utf8')
        finally:
            f.close()
        r = parseString(title=fn, raw=raw, wikidb=db)
        parser.show(sys.stdout, r)
        preprocess(r)
        parser.show(sys.stdout, r)
        dbw = MWXHTMLWriter()
        dbw.writeBook(r)
        nf = open("%s.html" % fn, "w")
        try:
            nf.write(dbw.asstring())
        finally:
            nf.close()
Example #37
0
def html():
    """Render each named article to a temporary HTML file and open it in the
    system web browser."""
    parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]")
    parser.add_option("-c", "--conf", help="config file")

    options, args = parser.parse_args()

    if not args:
        parser.error("missing ARTICLE argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.conf
    if not options.conf:
        parser.error("missing --conf argument")

    import StringIO
    import tempfile
    import os
    import webbrowser
    from mwlib import wiki, uparser, htmlwriter

    res = wiki.makewiki(conf)
    db = res['wiki']
    images = res['images']

    for title in articles:
        raw = db.getRawArticle(title)
        if not raw:
            continue

        out = StringIO.StringIO()
        # NOTE(review): charset="utf-8" inside the content attribute is
        # malformed HTML (stray quote) — kept as-is; confirm before changing.
        out.write("""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset="utf-8"></meta>
<link rel="stylesheet" href="pedia.css" />
</head>
<body>

""")

        # Bug fix: the original passed the undefined name `x` as the article
        # title here (NameError at runtime); use the loop's article title.
        tree = uparser.parseString(title, raw=raw, wikidb=db)
        w = htmlwriter.HTMLWriter(out, images)
        w.write(tree)

        fd, htmlfile = tempfile.mkstemp(".html")
        os.close(fd)
        f = open(htmlfile, "wb")
        try:
            f.write(out.getvalue().encode('utf-8'))
        finally:
            f.close()
        webbrowser.open("file://"+htmlfile)
Example #38
0
def main():
    """Render each wikitext file given on the command line to <name>.html."""
    for fn in sys.argv[1:]:
        from mwlib.dummydb import DummyDB
        from mwlib.uparser import parseString
        db = DummyDB()
        # read/write with explicit close (the original leaked both handles)
        # and avoid shadowing the `input` builtin
        f = open(fn)
        try:
            raw = unicode(f.read(), 'utf8')
        finally:
            f.close()
        r = parseString(title=fn, raw=raw, wikidb=db)
        parser.show(sys.stdout, r)
        preprocess(r)
        parser.show(sys.stdout, r)
        dbw = MWXHTMLWriter()
        dbw.writeBook(r)
        nf = open("%s.html" % fn, "w")
        try:
            nf.write(dbw.asstring())
        finally:
            nf.close()
Example #39
0
    def parse(self):
        """ Create a parse tree and then extract data for article from it. """
        # nothing to do for pages absent from the wiki
        if self.missing():
            return

        self.parsetree = uparser.parseString(title=self.title,
                                             raw=self.wikitext)

        text, links = get_text_and_links(self.parsetree, self.ignoreSections)
        joined = u''.join(text)
        # drop leading spaces and newlines from the extracted plain text
        self.plaintext = joined.lstrip(' \n')
        self.links = links
Example #40
0
def html():
    """Render each named article to a temporary HTML file and open it in the
    system web browser."""
    parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]")
    parser.add_option("-c", "--conf", help="config file")

    options, args = parser.parse_args()

    if not args:
        parser.error("missing ARTICLE argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.conf
    if not options.conf:
        parser.error("missing --conf argument")

    import StringIO
    import tempfile
    import os
    import webbrowser
    from mwlib import wiki, uparser, htmlwriter

    res = wiki.makewiki(conf)
    db = res['wiki']
    images = res['images']

    for title in articles:
        raw = db.getRawArticle(title)
        if not raw:
            continue

        out = StringIO.StringIO()
        # NOTE(review): charset="utf-8" inside the content attribute is
        # malformed HTML (stray quote) — kept as-is; confirm before changing.
        out.write("""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset="utf-8"></meta>
<link rel="stylesheet" href="pedia.css" />
</head>
<body>

""")

        # Bug fix: the original passed the undefined name `x` as the article
        # title here (NameError at runtime); use the loop's article title.
        tree = uparser.parseString(title, raw=raw, wikidb=db)
        w = htmlwriter.HTMLWriter(out, images)
        w.write(tree)

        fd, htmlfile = tempfile.mkstemp(".html")
        os.close(fd)
        f = open(htmlfile, "wb")
        try:
            f.write(out.getvalue().encode('utf-8'))
        finally:
            f.close()
        webbrowser.open("file://" + htmlfile)
Example #41
0
def test_attributes():
    """Row attributes must be parsed into dicts; STYLE keys are lower-cased."""
    t1 = '''
{|
|- STYLE="BACKGROUND:#FFDEAD;"
|stuff
|}
'''
    r = parseString(title='t', raw=t1)
    buildAdvancedTree(r)
    n = r.getChildNodesByClass(Row)[0]
    print n.attributes, n.style
    assert isinstance(n.style, dict)
    assert isinstance(n.attributes, dict)
    # the upper-case BACKGROUND key is normalised to lower case
    assert n.style["background"] == "#FFDEAD"
Example #42
0
def renderMW(txt, filesuffix=None):
    """Parse *txt*, clean the tree, and render it through the ReportLab writer
    into a temporary directory."""
    tree = uparser.parseString(title='Test', raw=txt)

    advtree.buildAdvancedTree(tree)
    cleaner = TreeCleaner(tree)
    cleaner.cleanAll()

    tmpdir = tempfile.mkdtemp()
    writer = RlWriter(test_mode=True)
    writer.wikiTitle = 'testwiki'
    writer.tmpdir = tmpdir
    writer.imgDB = dummyImageDB(basedir=tmpdir)
    elements = writer.write(tree)
    renderElements(elements, filesuffix, tmpdir)
Example #43
0
def test_defintion_list():
    """http://code.pediapress.com/wiki/ticket/221"""
    raw = u''';termA
:descr1
'''

    # second iteration re-runs the same markup with all newlines stripped
    for _ in range(2):
        tree = parseString(title='t', raw=raw)
        buildAdvancedTree(tree)
        lists = tree.getChildNodesByClass(DefinitionList)
        assert len(lists) == 1
        assert lists[0].getChildNodesByClass(DefinitionTerm)
        assert lists[0].getChildNodesByClass(DefinitionDescription)
        raw = raw.replace('\n', '')
Example #44
0
 def _parse_wiki(input):
     """Normalise trailing line endings in *input* and parse it as UTF-8
     MediaWiki markup."""
     db = DummyDB()

     out = StringIO.StringIO()

     # normalise CRLF / lone CR to LF before parsing
     if input.endswith(chr(13) + chr(10)):
         input = input.replace(chr(13) + chr(10), chr(10))
     if input.endswith(chr(13)):
         input = input.replace(chr(13), chr(10))

     try:
         p = parseString("title", input.decode("utf8"))
     except Exception, ex:
         # NOTE(review): `raise ex` makes the return below unreachable and
         # (in Python 2) replaces the original traceback — confirm intent.
         raise ex
         return u'Unable to parse input!'
Example #45
0
    def _parse_wiki(input):
        """Normalise trailing line endings in *input* and parse it as UTF-8
        MediaWiki markup."""
        db = DummyDB()

        out = StringIO.StringIO()

        # normalise CRLF / lone CR to LF before parsing
        if input.endswith(chr(13) + chr(10)):
            input = input.replace(chr(13) + chr(10), chr(10))
        if input.endswith(chr(13)):
            input = input.replace(chr(13), chr(10))

        try:
            p = parseString("title", input.decode("utf8"))
        except Exception, ex:
            # NOTE(review): `raise ex` makes the return below unreachable and
            # (in Python 2) replaces the original traceback — confirm intent.
            raise ex
            return u'Unable to parse input!'
Example #46
0
 def run(self):
     """Parse the directive content as MediaWiki markup and return it wrapped
     in a single raw-HTML docutils node."""
     raw = u'\n'.join(self.content)
     # empty wikidb
     db = DummyDB()
     # run parser and pre-processors
     parsed = parseString(title='Export', raw=raw, wikidb=db)
     preprocess(parsed)
     # write XHTML
     xhtml = MWXHTMLWriter()
     xhtml.writeBook(parsed)
     # remove the H1 heading (title) from the document
     article = xhtml.xmlbody.getchildren()[0]
     article.remove(article.getchildren()[0]) # remove caption
     # render to string
     block = ET.tostring(xhtml.xmlbody)
     return [nodes.raw('', block, format='html')]
Example #47
0
def get_xhtml(wikitext):
    """Render *wikitext* to XHTML and post-process the markup into a tidier
    form (empty paragraphs dropped, wrapper elements stripped, paragraph
    breaks put on their own lines)."""
    tree = parseString(title="", raw=wikitext)
    preprocess(tree)
    writer = MyWriter()
    writer.writeBook(tree)
    text = writer.asstring()
    # substitutions are applied in order; later patterns may depend on the
    # output of earlier ones
    for pattern, repl in (
        ('<p />', ''),
        ('<p> ', '<p>'),
        (' </p>', '</p>'),
        ('</p><p>', '</p>\n<p>'),
        (' <br /> &#160;&#160;&#160;&#160;&#160; ', '</p>\n<p>'),
        ('&#160;&#160;&#160;&#160;&#160; ', '<p>'),
        ('</dd><dd>', '</dd>\n<dd>'),
        ('<body><div class="mwx.article"><h1 />', ''),
        ('</div></body>', ''),
    ):
        text = re.sub(pattern, repl, text)
    return text
Example #48
0
 def to_html(cls, kb_entry):
     """Render a knowledge-base entry's wiki body to an XHTML string."""
     from mwlib.uparser import parseString
     from mwlib.xhtmlwriter import MWXHTMLWriter, preprocess
     try:
         import xml.etree.ElementTree as ET
     except ImportError:
         # bug fix: narrowed the bare `except:` — only a missing module
         # should trigger the elementtree fallback, not e.g. KeyboardInterrupt
         from elementtree import ElementTree as ET
     r = kb_entry.body.replace("\r", "")  # drop carriage returns
     parsed = parseString(title=kb_entry.subject,
                          raw=r,
                          wikidb=cls.NOCDB(kb_entry))
     preprocess(parsed)
     xhtml = MWXHTMLWriter()
     xhtml.writeBook(parsed)
     block = ET.tostring(xhtml.xmlbody)
     return block
Example #49
0
    def parseArticle(
        self,
        title,
        revision=None,
        raw=None,
        wikidb=None,
        imagedb=None,
    ):
        """Parse article with given title, revision and raw wikitext, adding all
        referenced templates and images, but not adding the article itself.
        
        @param title: title of article
        @type title: unicode
        
        @param revision: revision of article (optional)
        @type revision: int
        
        @param raw: wikitext of article
        @type raw: unicode
        
        @param wikidb: WikiDB to use
        
        @param imagedb: ImageDB to use (optional)
        """

        # NOTE(review): RecordDB presumably records every page/template/source
        # fetched while parsing into the given collections — confirm.
        recorddb = RecordDB(wikidb, self.articles, self.templates,
                            self.sources)
        parse_tree = uparser.parseString(
            title,
            revision=revision,
            raw=raw,
            wikidb=recorddb,
        )
        # without an imagedb there is nothing more to collect
        if imagedb is None:
            return
        for node in parse_tree.allchildren():
            if isinstance(node, parser.ImageLink):
                self.addImage(node.target, imagedb=imagedb, wikidb=wikidb)
            elif isinstance(node,
                            parser.TagNode) and node.caption == 'imagemap':
                # imagemap tags carry their image reference in
                # node.imagemap.imagelink, when present
                imagemap = getattr(node, 'imagemap', None)
                if imagemap is not None:
                    imagelink = getattr(imagemap, 'imagelink', None)
                    if imagelink is not None:
                        self.addImage(imagelink.target,
                                      imagedb=imagedb,
                                      wikidb=wikidb)
    def parseArticle(self, title,
        revision=None,
        raw=None,
        wikidb=None,
        imagedb=None,
    ):
        """Parse article with given title, revision and raw wikitext, adding all
        referenced templates and images, but not adding the article itself.
        
        @param title: title of article
        @type title: unicode
        
        @param revision: revision of article (optional)
        @type revision: int
        
        @param raw: wikitext of article
        @type raw: unicode
        
        @param wikidb: WikiDB to use
        
        @param imagedb: ImageDB to use (optional)
        """

        # NOTE(review): RecordDB presumably records every page/template/source
        # fetched while parsing into the given collections — confirm.
        recorddb = RecordDB(wikidb, self.articles, self.templates, self.sources)
        parse_tree = uparser.parseString(title,
            revision=revision,
            raw=raw,
            wikidb=recorddb,
        )
        # without an imagedb there is nothing more to collect
        if imagedb is None:
            return

        stats = self.node_stats
        for node in parse_tree.allchildren():
            if isinstance(node, parser.ImageLink):
                self.image_infos.add((node.target, imagedb, wikidb))
            elif isinstance(node, parser.TagNode) and node.caption == 'imagemap':
                # imagemap tags carry their image reference in
                # node.imagemap.imagelink, when present
                imagemap = getattr(node, 'imagemap', None)
                if imagemap is not None:
                    imagelink = getattr(imagemap, 'imagelink', None)
                    if imagelink is not None:
                        self.image_infos.add((imagelink.target, imagedb, wikidb))
            # stats
            k, w  = utils.get_nodeweight(node)
            stats[k] = stats.get(k, 0) + w
Example #51
0
    def get_authors_from_template_args(template):
        """Extract author names from a template invocation: prefer an explicit
        Author= argument; otherwise fall back to user links found in the
        positional arguments."""
        args = get_template_args(template, expander)

        author_arg = args.get('Author', None)
        if author_arg:
            # userlinks = getUserLinks(author_arg)
            # if userlinks:
            #     return userlinks
            # parse the Author argument and use its plain display text
            node = uparser.parseString('', raw=args['Author'], wikidb=wikidb)
            advtree.extendClasses(node)
            txt = node.getAllDisplayText().strip()
            if txt:
                return [txt]

        # scan all positional args (joined by newlines) for user links
        if args.args:
            return getUserLinks('\n'.join([args.get(i, u'') for i in range(len(args.args))]))

        return []
Example #52
0
    def getParsedArticle(self, title, revision=None):
        """Return the parse tree for *title* (or a specific *revision*), or
        None when the page has no raw text (e.g. a redirect to a missing
        article)."""
        if revision:
            page = self.nuwiki.get_page(None, revision)
        else:
            page = self.normalize_and_get_page(title, 0)

        if page:
            raw = page.rawtext
        else:
            raw = None


        if raw is None:
            return None

        from mwlib import uparser

        # the wiki's own language is taken from the site info
        return uparser.parseString(title=title, raw=raw, wikidb=self, lang=self.siteinfo["general"]["lang"])
Example #53
0
def print_tree(wikitext):
    """Print all of the nodes in the parse tree created from the wikitext."""
    clean_wiki = wiki_parser.remove_templates(wiki_parser.unescape(wikitext))
    tree = uparser.parseString(title='', raw=clean_wiki)
    result = ''
    stack = deque([(tree, 0)])
    while stack:
        node, depth = stack.popleft()
        label = str(node)
        if hasattr(node, 'type'):
            label = str(node.type) + ' | ' + str(node)
        result += (depth * '\t') + label + '\n'
        # push children so they pop in document order (depth-first traversal)
        pending = deque()
        for child in node.children:
            pending.appendleft((child, depth + 1))
        stack.extendleft(pending)

    return result
Example #54
0
def main():
    """Convert each wikitext file on the command line to an ODF document."""
    for fn in sys.argv[1:]:

        from mwlib.dummydb import DummyDB
        from mwlib.uparser import parseString
        db = DummyDB()
        input = unicode(open(fn).read(), 'utf8')
        r = parseString(title=fn, raw=input, wikidb=db)
        #parser.show(sys.stdout, r)
        #advtree.buildAdvancedTree(r)
        #tc = TreeCleaner(r)
        #tc.cleanAll()

        preprocess(r)
        parser.show(sys.stdout, r)
        odf = ODFWriter()
        odf.writeTest(r)
        doc = odf.getDoc()
        #doc.toXml("%s.xml"%fn)
        # NOTE(review): second argument True presumably appends the ODF file
        # suffix to fn — confirm against the odfpy save() API.
        doc.save(fn, True)
Example #55
0
def test_definitiondescription():
    """Each nested ':'-description line must end up as a DefinitionDescription
    with indentlevel 1."""
    raw = u"""
== test ==

:One
::Two
:::Three
::::Four

"""
    db = DummyDB()
    r = parseString(title="t", raw=raw, wikidb=db)
    parser.show(sys.stdout, r)

    buildAdvancedTree(r)
    dd = r.getChildNodesByClass(DefinitionDescription)
    print "DD:", dd
    for c in dd:
        assert c.indentlevel == 1
    # one DefinitionDescription per ':'-prefixed source line
    assert len(dd) == 4