def test_table_not_eating():
    """Regression check: this table markup once caused an internal parser error.

    The test passes as long as ``uparser.simpleparse`` returns without raising.
    """
    markup = """{|) |10<sup>10<sup>100</sup></sup>||gsdfgsdfg |}"""
    uparser.simpleparse(markup)
def test_table_not_eating2():
    """Regression check: HTML rows inside a wiki table once caused an internal parser error.

    The test passes as long as ``uparser.simpleparse`` returns without raising.
    """
    markup = """{| <tr><td>'''Birth name'''</td><td colspan="2">Alanis Nadine Morissette</td></tr><tr><td>'''Born'''</td> |} """
    uparser.simpleparse(markup)
def parse(self, text):
    """Parse UTF-8 encoded wikitext and pass the resulting tree to ``parse_node``.

    While ``simpleparse`` runs, ``sys.stdout`` points at a throwaway
    in-memory buffer, so anything printed during parsing is discarded.

    :param text: byte string; it is decoded as UTF-8 before parsing.
    """
    sys_stdout = sys.stdout
    ast_str = StringIO.StringIO()
    sys.stdout = ast_str
    try:
        ast = simpleparse(text.decode('utf-8'))
    finally:
        # Always put the real stdout back: without this, a decode or parse
        # failure would leave the whole process printing into the buffer.
        sys.stdout = sys_stdout
    self.parse_node(ast)
def parse_smwtable_wikitext(self, wikitext):
    """Convert smwtable-format wikitext into a list of record dicts.

    The first child of the first table found supplies the keys. Every
    later child becomes one dict that pairs those keys with the stripped
    text of its own children. An empty list is returned when the parsed
    text contains no table.
    """
    tree = uparser.simpleparse(wikitext)
    tables = tree.find(parser.Table)
    if len(tables) == 0:
        return []

    rows = tables[0].children
    header = [cell.asText().strip() for cell in rows[0].children]

    records = []
    # Skip the header row; the original author notes 3 cols in each row.
    for row in itertools.islice(rows, 1, None):
        values = [cell.asText().strip() for cell in row.children]
        records.append(dict(zip(header, values)))
    return records
def exportUrl(pages="", opts=Opts()): req = urllib2.Request(opts.WIKI +"index.php", data="title=Special:Export&pages={0}".format(pages)) res = urllib2.urlopen(req); body = doc = etree.fromstring(body) pagetitle = doc.xpath("//mw:mediawiki/mw:page/mw:title/text()", namespaces=namespaces)[0] revisions = doc.xpath("//mw:mediawiki/*/mw:revision", namespaces=namespaces) for rev in revisions: version = rev.xpath("mw:id/text()", namespaces=namespaces)[0] tstamp = rev.xpath("mw:timestamp/text()", namespaces=namespaces)[0] content = rev.xpath("mw:text/text()", namespaces=namespaces)[0] contrib = rev.xpath("mw:contributor/mw:username/text()", namespaces=namespaces)[0] struct = simpleparse(content) articleUrls = fetchUrls(struct) envelopeList = [] for url in articleUrls.keys(): recordId = "{0}-{1}".format(version, hashlib.sha1(url).hexdigest()) refUrl = "{0}index.php/{1}".format(opts.WIKI, pages) paradata = template.format(recordId, pagetitle, refUrl, url, articleUrls[url]) envelope = getEnvelope(recordId, url, tstamp, contrib, paradata) #print "{0} count: {1} sha: {2}\n".format(url, articleUrls[url], recordId) print json.dumps(envelope) envelopeList.append(envelope) bulkUpdate(envelopeList, opts)
def test_pull_in_styletags_1():
    """Text between ``<b>`` and ``</b>`` must land in Style nodes; text after must not.

    The joined text of all Style nodes has to contain "one", "two" and
    "three" but not "four".
    """
    markup = '<b> one\n\n== two ==\n\nthree\n</b>\nfour\n'
    tree = uparser.simpleparse(markup)
    styled_text = " ".join([node.asText() for node in tree.find(parser.Style)])
    for word in ("one", "two", "three"):
        assert word in styled_text
    assert "four" not in styled_text
def test_pull_in_styletags_1():
    """Check which words of a ``<b>`` block spanning a heading end up in Style nodes.

    "one", "two" and "three" sit between ``<b>`` and ``</b>`` and must be
    found in the Style nodes' text; "four" follows ``</b>`` and must not.
    """
    source = '<b> one\n\n== two ==\n\nthree\n</b>\nfour\n'
    style_nodes = uparser.simpleparse(source).find(parser.Style)
    pieces = []
    for style_node in style_nodes:
        pieces.append(style_node.asText())
    joined = " ".join(pieces)
    assert "one" in joined
    assert "two" in joined
    assert "three" in joined
    assert not ("four" in joined)
def wikitext(self): from mwlib.uparser import simpleparse from htmlwriter import HTMLWriter from StringIO import StringIO out = StringIO() print "Unwikified text: %s" % (self.text) w = HTMLWriter(out) w.write(simpleparse(self.text)) return out.getvalue()
def display_mediawiki(request, path, text):
    """Render MediaWiki ``text`` as a standalone HTML page.

    The page title comes from the ``svnweb:mediawiki:title`` property of
    ``path`` when it can be read; otherwise it is the last path component
    with its final extension removed.

    :param request: not used by this function.
    :param path: path of the file being displayed.
    :param text: MediaWiki markup to render.
    :returns: tuple ``("text/html", page)`` where ``page`` is the HTML string.
    """
    from xml.sax.saxutils import escape

    article = simpleparse(text)
    try:
        # Try to figure out a title by looking up svn properties
        caption = client.propget("svnweb:mediawiki:title", path).values()[0]
    except Exception:
        # Best effort: any lookup failure falls back to the filename.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        caption = None
    if caption is None:
        # No explicit name, so we'll derive it from the filename
        caption = path.rpartition("/")[2]
        if "." in caption:
            caption = caption.rpartition(".")[0]
    article.caption = caption

    writer = MWXHTMLWriter()
    writer.xwriteStyle = mediawiki_xwriteStyle
    writer.xwriteArticleLink = mediawiki_xwriteArticleLink
    element = writer.write(article)
    result = ElementTree.tostring(element)

    # "%%" is a literal percent sign; the single "%s" receives the title.
    page_head = (
        ' <html><head><title>%s</title> <style type="text/css"> '
        'body {font-size: 13px; font-family: sans-serif; padding: 30px; background-color: #f9f9f9} '
        '.content {border: 1px solid #aaa; padding: 10px; padding-top: 0px; background-color: white} '
        '.content div[class~="mwx.paragraph"] {margin-bottom: 12px} '
        '.content > div > h1 {font-size: 24px; width: 100%%; border-bottom: 1px solid #aaa; margin-bottom: 12px; margin-top: 8px} '
        '.content > div > div > h2 {font-size: 19px; width: 100%%; border-bottom: 1px solid #aaa; margin-bottom: 8px} '
        '.content > div > div > div > h2 {font-size: 17px; margin-bottom: 7px; margin-top: 18px} '
        '.content > div > div > div > div > h2 {font-size: 15px; margin-bottom: 4px} '
        '.content > div > div > div > div > div > h2 {font-size: 13px; margin-bottom: 4px} '
        'span[class~="mwx.svnweb.bold"] {font-weight: bold} '
        'span[class~="mwx.svnweb.italic"] {font-style: italic} '
        '</style> </head><body> <div class="content"> '
    )
    page_tail = ' </div></body></html> '

    # The title is raw text from a property or filename; escape &, < and >
    # so it cannot break out of the <title> element.
    result = page_head % escape(caption) + result + page_tail
    return "text/html", result
#!/usr/bin/env python
"""Feed the ``<text>`` element of each input line through mwlib's ``simpleparse``.

Input lines come from ``fileinput`` (files named on the command line, or
stdin). Each line is parsed as XML, its ``<text>`` content is unescaped,
brace-delimited spans are blanked out, and the remainder is parsed.
"""
from mwlib.uparser import parseString, simpleparse
from BeautifulSoup import BeautifulStoneSoup
from xml.sax.saxutils import unescape
from mwlib.parser.nodes import *
import fileinput
import re


def replace_nested(regex, text):
    """Replace matches of ``regex`` in ``text`` with a space until nothing changes.

    Repeating the substitution lets a pattern that only matches innermost
    spans peel nested constructs away one level per pass.

    :param regex: pattern accepted by ``re.sub``.
    :param text: string to clean.
    :returns: the text after the first pass that leaves it unchanged.
    """
    while True:
        original = text
        text = re.sub(regex, ' ', text)
        if original == text:
            return text


for line in fileinput.input():
    xml = BeautifulStoneSoup(line)
    node = xml.find('text')
    if node is None or node.string is None:
        # Nothing to parse on this line (no <text> element, or no single
        # string inside it); skip it instead of failing in unescape().
        continue
    text = node.string
    # unescape() already handles &amp;, &lt; and &gt;; the quote entities
    # have to be listed explicitly.
    text = unescape(text, {"&apos;": "'", "&quot;": '"'})
    # Blank out {...} spans from the innermost outwards.
    text = replace_nested(r'{[^{]*?}', text)
    simpleparse(text)
from mwlib.uparser import simpleparse, parseString #?simpleparse _ip.system("bash") _ip.system("/bin/ls --color=auto ") testfiles=('mwtable-01.txt','mwtable-02.txt') import mwlib testtxt = file(testfiles[0],'r').read() _ip.magic("pwd ") _ip.system("/bin/ls --color=auto ") testfiles=('mwtable-test01.txt','mwtable-test02.txt') testfiles=('mwtable-test01.txt','mwtable-test02.txt') testtxt = file(testfiles[0],'r').read() testtxt from mwlib.uparser import simpleparse, parseString testtxt simpleparse(testtxt) testtxt type(testtxt( )) type(testtxt) unicode(testtxt) unicode(testtxt,replace='replace') #?unicode #?unicode.decode unicode.decode(testtxt,errors='replace') str.decode(testtxt,errors='replace') #?str.decode testtxt.decode('UTF8',errors='replace') testtxt.decode('UTF8','replace') type(testtxt.decode('UTF8','replace')) utesttxt=testtxt.decode('UTF8','replace')