Example #1
0
def test_table_not_eating():
    """Regression check for an internal parser error on an odd table.

    The table opener is followed by a stray ")" and the first cell nests
    <sup> tags. The test makes no assertions; it only requires that
    parsing the markup completes.

    http://code.pediapress.com/wiki/ticket/32
    http://code.pediapress.com/wiki/ticket/29
    """
    markup = "\n".join((
        "{|)",
        "|10<sup>10<sup>100</sup></sup>||gsdfgsdfg",
        "|}",
    ))
    uparser.simpleparse(markup)
Example #2
0
def test_table_not_eating():
    """Parse a table whose first cell nests <sup> tags without crashing.

    Guards against the internal parser error tracked in:
    http://code.pediapress.com/wiki/ticket/32
    http://code.pediapress.com/wiki/ticket/29
    """
    opener = "{|)\n"
    cells = "|10<sup>10<sup>100</sup></sup>||gsdfgsdfg\n"
    closer = "|}"
    # No assertions: completing the parse is the whole test.
    uparser.simpleparse(opener + cells + closer)
Example #3
0
def test_table_not_eating2():
    """Parse a wiki table that contains raw HTML rows without crashing.

    The markup mixes ``{| ... |}`` table delimiters with ``<tr>``/``<td>``
    tags and leaves the last row unclosed. The test makes no assertions;
    it only requires that parsing completes.

    http://code.pediapress.com/wiki/ticket/32
    http://code.pediapress.com/wiki/ticket/29
    """
    rows = (
        # The opener line carries a trailing space, as in the original input.
        "{| ",
        """<tr><td>'''Birth&nbsp;name'''</td><td colspan="2">Alanis Nadine Morissette</td></tr><tr><td>'''Born'''</td>""",
        "|}",
        # Empty last element yields the terminating newline.
        "",
    )
    uparser.simpleparse("\n".join(rows))
Example #4
0
def test_table_not_eating2():
    """Parse a wiki table that contains raw HTML rows without crashing.

    Guards against the internal parser error tracked in:
    http://code.pediapress.com/wiki/ticket/32
    http://code.pediapress.com/wiki/ticket/29
    """
    rows = (
        "{|",
        """<tr><td>'''Birth&nbsp;name'''</td><td colspan="2">Alanis Nadine Morissette</td></tr><tr><td>'''Born'''</td>""",
        "|}",
        # Empty last element yields the terminating newline.
        "",
    )
    # No assertions: completing the parse is the whole test.
    uparser.simpleparse("\n".join(rows))
 def parse(self, text):
   """Parse UTF-8 encoded wikitext and pass the resulting tree to ``parse_node``.

   ``sys.stdout`` is pointed at a throwaway in-memory buffer while
   ``simpleparse`` runs, so anything it prints is discarded.

   Args:
     text: byte string holding UTF-8 encoded wikitext.

   Raises:
     UnicodeDecodeError: if ``text`` is not valid UTF-8.
   """
   saved_stdout = sys.stdout
   sys.stdout = StringIO.StringIO()
   try:
     ast = simpleparse(text.decode('utf-8'))
   finally:
     # Always restore the real stdout, even when decoding or parsing fails;
     # otherwise all later output of the process would vanish into the buffer.
     sys.stdout = saved_stdout
   self.parse_node(ast)
Example #6
0
    def parse_smwtable_wikitext(self, wikitext):
        """Turn smwtable-format wikitext into a list of row dictionaries.

        The first row of the first table found supplies the keys. Every later
        row becomes one dict pairing those keys with the stripped text of the
        row's cells; surplus keys or cells are dropped by ``zip``. Returns an
        empty list when the parsed text contains no table.
        """
        tables = uparser.simpleparse(wikitext).find(parser.Table)
        if not tables:
            return []

        rows = tables[0].children
        header = [cell.asText().strip() for cell in rows[0].children]

        def cell_texts(row):
            # Stripped text of every cell in one table row.
            return [cell.asText().strip() for cell in row.children]

        return [
            dict(zip(header, cell_texts(row)))
            for row in itertools.islice(rows, 1, None)
        ]
Example #7
0
 def parse(self, text):
     """Parse UTF-8 encoded wikitext and pass the resulting tree to ``parse_node``.

     ``sys.stdout`` is pointed at a throwaway in-memory buffer while
     ``simpleparse`` runs, so anything it prints is discarded.

     Args:
         text: byte string holding UTF-8 encoded wikitext.

     Raises:
         UnicodeDecodeError: if ``text`` is not valid UTF-8.
     """
     saved_stdout = sys.stdout
     sys.stdout = StringIO.StringIO()
     try:
         ast = simpleparse(text.decode('utf-8'))
     finally:
         # Always restore the real stdout, even when decoding or parsing
         # fails; otherwise all later output would vanish into the buffer.
         sys.stdout = saved_stdout
     self.parse_node(ast)
Example #8
0
    def parse_smwtable_wikitext(self, wikitext):
        """Parse smwtable wikitext and return its rows as a list of dicts.

        Keys come from the cells of the first row of the first table in the
        parsed text; each following row is zipped against those keys using the
        stripped text of its cells. An input without any table yields ``[]``.
        """
        found = uparser.simpleparse(wikitext).find(parser.Table)
        if not found:
            return []
        first_table = found[0]

        column_names = [
            cell.asText().strip() for cell in first_table.children[0].children
        ]

        records = []
        for row in itertools.islice(first_table.children, 1, None):
            values = [cell.asText().strip() for cell in row.children]
            records.append(dict(zip(column_names, values)))
        return records
def exportUrl(pages="", opts=Opts()):
    """Export ``pages`` from the wiki and publish an envelope per URL per revision.

    POSTs a ``Special:Export`` request to ``opts.WIKI + "index.php"`` and parses
    the returned XML. For each revision it parses the revision text with
    ``simpleparse``, collects URLs via ``fetchUrls``, builds one envelope per
    URL (printing it as JSON), and hands the revision's envelopes to
    ``bulkUpdate``.

    Args:
        pages: value sent verbatim as the ``pages`` form field; also appended
            to ``opts.WIKI`` to build the reference URL of each record.
        opts: options object. ``opts.WIKI`` is read here and the object is
            forwarded to ``bulkUpdate``. The default is a single ``Opts()``
            instance created once, when this function is defined.

    Raises:
        IndexError: if the export has no page title, or a revision lacks an
            id, timestamp, text or contributor username.
    """
    req = urllib2.Request(opts.WIKI + "index.php", data="title=Special:Export&pages={0}".format(pages))
    res = urllib2.urlopen(req)
    try:
        body = res.read()
    finally:
        # Release the HTTP connection even if reading the body fails.
        res.close()

    doc = etree.fromstring(body)
    pagetitle = doc.xpath("//mw:mediawiki/mw:page/mw:title/text()", namespaces=namespaces)[0]
    revisions = doc.xpath("//mw:mediawiki/*/mw:revision", namespaces=namespaces)

    # Identical for every record, so build it once.
    refUrl = "{0}index.php/{1}".format(opts.WIKI, pages)

    for rev in revisions:
        version = rev.xpath("mw:id/text()", namespaces=namespaces)[0]
        tstamp = rev.xpath("mw:timestamp/text()", namespaces=namespaces)[0]
        content = rev.xpath("mw:text/text()", namespaces=namespaces)[0]
        contrib = rev.xpath("mw:contributor/mw:username/text()", namespaces=namespaces)[0]

        struct = simpleparse(content)

        # Mapping of URL -> value stored by fetchUrls (presumably an
        # occurrence count -- confirm against fetchUrls).
        articleUrls = fetchUrls(struct)

        envelopeList = []
        for url, count in articleUrls.items():
            # Record id combines the revision id with a SHA-1 of the URL.
            recordId = "{0}-{1}".format(version, hashlib.sha1(url).hexdigest())
            paradata = template.format(recordId, pagetitle, refUrl, url, count)
            envelope = getEnvelope(recordId, url, tstamp, contrib, paradata)
            print(json.dumps(envelope))
            envelopeList.append(envelope)

        # One bulk update per revision.
        bulkUpdate(envelopeList, opts)
Example #10
0
def test_pull_in_styletags_1():
    """A <b> tag spanning a heading and paragraphs must cover all of them.

    Text inside the <b>...</b> pair ("one", "two", "three") has to appear in
    the Style nodes; text after the closing tag ("four") must not.
    """
    source = '<b> one\n\n== two ==\n\nthree\n</b>\nfour\n'
    tree = uparser.simpleparse(source)
    styled_text = " ".join(node.asText() for node in tree.find(parser.Style))
    for word in ("one", "two", "three"):
        assert word in styled_text
    assert "four" not in styled_text
Example #11
0
def test_pull_in_styletags_1():
    """Style nodes must contain everything between <b> and </b>, nothing after.

    The bold tag is opened before a heading and closed two paragraphs later;
    "one", "two" and "three" sit inside it, "four" follows the closing tag.
    """
    parsed = uparser.simpleparse('<b> one\n\n== two ==\n\nthree\n</b>\nfour\n')
    pieces = [style.asText() for style in parsed.find(parser.Style)]
    joined = " ".join(pieces)
    for inside in ("one", "two", "three"):
        assert inside in joined
    assert "four" not in joined
Example #12
0
	def wikitext(self):
		"""Render ``self.text`` with ``HTMLWriter`` and return what it wrote.

		The text is parsed with mwlib's ``simpleparse``; the writer output is
		collected in an in-memory buffer. Also prints the raw text to stdout.
		"""
		from mwlib.uparser import simpleparse
		from htmlwriter import HTMLWriter
		from StringIO import StringIO
		sink = StringIO()
		print("Unwikified text: %s" % (self.text))
		writer = HTMLWriter(sink)
		tree = simpleparse(self.text)
		writer.write(tree)
		return sink.getvalue()
Example #13
0
def display_mediawiki(request, path, text):
    """Render MediaWiki ``text`` as a standalone HTML page.

    The page caption is taken from the ``svnweb:mediawiki:title`` svn property
    of ``path``. When that lookup fails or yields ``None``, it is derived from
    the last path component with its final extension removed.

    Args:
        request: unused by this function.
        path: repository path of the file; used for the property lookup and
            the fallback caption.
        text: MediaWiki markup to render.

    Returns:
        A ``(content_type, body)`` tuple whose content type is ``"text/html"``.
    """
    # Local import keeps this block self-contained.
    from xml.sax.saxutils import escape

    article = simpleparse(text)
    try:
        # Try to figure out a title by looking up svn properties
        caption = client.propget("svnweb:mediawiki:title", path).values()[0]
    except Exception:
        # Best effort: any lookup failure falls back to the filename, but
        # SystemExit / KeyboardInterrupt are no longer swallowed.
        caption = None
    if caption is None: # No explicit name, so we'll derive it from the filename
        caption = path.rpartition("/")[2]
        if "." in caption:
            caption = caption.rpartition(".")[0]
    article.caption = caption
    writer = MWXHTMLWriter()
    writer.xwriteStyle = mediawiki_xwriteStyle
    writer.xwriteArticleLink = mediawiki_xwriteArticleLink
    element = writer.write(article)
    result = ElementTree.tostring(element)
    # The caption comes from a filename or an svn property, so escape it
    # before placing it inside <title>. "%%" is a literal percent sign.
    result = """
    <html><head><title>%s</title>
    
    <style type="text/css">
    
    body {font-size: 13px; font-family: sans-serif; padding: 30px;
          background-color: #f9f9f9}
    
    .content {border: 1px solid #aaa; padding: 10px; padding-top: 0px;
              background-color: white}
    
    .content div[class~="mwx.paragraph"] {margin-bottom: 12px}
    
    .content > div > h1 {font-size: 24px; width: 100%%;
                         border-bottom: 1px solid #aaa; margin-bottom: 12px;
                         margin-top: 8px}
    
    .content > div > div > h2 {font-size: 19px; width: 100%%; 
                           border-bottom: 1px solid #aaa;
                           margin-bottom: 8px}
    .content > div > div > div > h2 {font-size: 17px; margin-bottom: 7px;
                                     margin-top: 18px}
    .content > div > div > div > div > h2 {font-size: 15px; margin-bottom: 4px}
    .content > div > div > div > div > div > h2 {font-size: 13px; margin-bottom: 4px}
    
    span[class~="mwx.svnweb.bold"] {font-weight: bold}
    span[class~="mwx.svnweb.italic"] {font-style: italic}
    
    </style>
    
    </head><body>
    <div class="content">
    """ % escape(caption) + result + """
    </div></body></html>
    """
    return "text/html", result
#!/usr/bin/env python
from mwlib.uparser import parseString, simpleparse
from BeautifulSoup import BeautifulStoneSoup
from xml.sax.saxutils import unescape
from mwlib.parser.nodes import *
import fileinput
import re

def replace_nested(regex, text):
    """Replace matches of ``regex`` in ``text`` with a space until nothing changes.

    Each pass runs ``re.sub(regex, ' ', text)``. Repeating the pass lets a
    pattern that only matches innermost constructs peel nested ones away
    layer by layer. Returns the text from the first pass that leaves it
    unchanged.
    """
    while True:
        replaced = re.sub(regex, ' ', text)
        if replaced == text:
            return replaced
        text = replaced

# Each input line is handled as one XML fragment: take the string of its
# <text> element, undo XML escaping, blank out {...} groups (innermost first,
# repeated until none remain) and parse what is left.
for line in fileinput.input():
    xml = BeautifulStoneSoup(line)
    text = replace_nested(
        '{[^{]*?}',
        unescape(xml.find('text').string, {"&apos;": "'", "&quot;": '"'}),
    )
    simpleparse(text)
    




#!/usr/bin/env python
from mwlib.uparser import parseString, simpleparse
from BeautifulSoup import BeautifulStoneSoup
from xml.sax.saxutils import unescape
from mwlib.parser.nodes import *
import fileinput
import re


def replace_nested(regex, text):
    """Substitute a space for every ``regex`` match, repeating to a fixed point.

    One ``re.sub`` pass may expose new matches (for example the outer pair of
    a nested construct once the inner pair is gone), so passes are repeated
    until one returns its input unchanged; that result is returned.
    """
    while True:
        after = re.sub(regex, ' ', text)
        if after == text:
            return after
        text = after


# Each input line is handled as one XML fragment: take the string of its
# <text> element, undo XML escaping, blank out {...} groups (innermost first,
# repeated until none remain) and parse what is left.
for line in fileinput.input():
    xml = BeautifulStoneSoup(line)
    text = replace_nested(
        '{[^{]*?}',
        unescape(xml.find('text').string, {"&apos;": "'", "&quot;": '"'}),
    )
    simpleparse(text)
Example #16
0
# Looks like a saved IPython session history (note the `_ip.system` /
# `_ip.magic` calls and the `#?name` help lookups): an interactive experiment
# feeding MediaWiki table test files to mwlib's simpleparse.
# NOTE(review): this is a record of trial and error, not a runnable script;
# several lines below would raise if replayed as-is.
from mwlib.uparser import simpleparse, parseString
#?simpleparse
_ip.system("bash")
_ip.system("/bin/ls --color=auto ")
# First guess at the test file names; replaced a few lines further down.
testfiles=('mwtable-01.txt','mwtable-02.txt')
import mwlib
# `file(...)` is the Python 2 built-in; the handle is never closed explicitly.
testtxt = file(testfiles[0],'r').read()
_ip.magic("pwd ")
_ip.system("/bin/ls --color=auto ")
# Corrected file names (the assignment was entered twice in the session).
testfiles=('mwtable-test01.txt','mwtable-test02.txt')
testfiles=('mwtable-test01.txt','mwtable-test02.txt')
testtxt = file(testfiles[0],'r').read()
testtxt
from mwlib.uparser import simpleparse, parseString
testtxt
# Parse the raw text as read from the file.
simpleparse(testtxt)
testtxt
# Typo kept from the session: this calls `testtxt` (the result of read())
# instead of just inspecting its type.
type(testtxt(
))
type(testtxt)
# The following lines try out different ways of turning `testtxt` into
# unicode -- presumably because simpleparse choked on the raw bytes; confirm.
unicode(testtxt)
unicode(testtxt,replace='replace')
#?unicode
#?unicode.decode
unicode.decode(testtxt,errors='replace')
str.decode(testtxt,errors='replace')
#?str.decode
testtxt.decode('UTF8',errors='replace')
testtxt.decode('UTF8','replace')
type(testtxt.decode('UTF8','replace'))
# Final form: decode as UTF-8 using the 'replace' error handler.
utesttxt=testtxt.decode('UTF8','replace')