def test_identity():
    """Check object identity vs. equality among sibling BreakingReturn nodes."""
    raw = """ <br/> <br/> <br/> <br/> <br/> <br/> <br/> <br/> """.decode("utf8")
    db = DummyDB()
    r = parseString(title="X33", raw=raw, wikidb=db)
    buildAdvancedTree(r)
    _treesanity(r)
    brs = r.getChildNodesByClass(BreakingReturn)
    for i, br in enumerate(brs):
        # every node appears among its own siblings, at its enumerated index
        assert br in br.siblings
        assert i == _idIndex(br.parent.children, br)
        # NOTE(review): this assumes the parent's children are exactly the
        # BreakingReturn nodes (no interleaved text nodes) -- TODO confirm
        assert len([x for x in br.parent.children if x is not br]) == len(brs) - 1
        for bbr in brs:
            if br is bbr:
                continue
            # distinct <br/> nodes compare equal but are not the same object
            assert br == bbr
            assert br is not bbr
def cleanMarkupSingle(raw, cleanerMethod):
    """Run exactly one treecleaner method over the parsed markup.

    Returns a (tree, reports) pair, where reports are the cleaner's
    saved reports for that single pass.
    """
    parse_tree = getTreeFromMarkup(raw)
    buildAdvancedTree(parse_tree)
    cleaner = TreeCleaner(parse_tree, save_reports=True)
    cleaner.clean([cleanerMethod])
    return (parse_tree, cleaner.getReports())
def test_copy():
    """Deep-copying an advanced tree must preserve structure and captions."""
    raw = """ ===[[Leuchtturm|Leuchttürme]] auf Fehmarn=== *[[Leuchtturm Flügge]] super da *[[Leuchtturm Marienleuchte]] da auch *[[Leuchtturm Strukkamphuk]] spitze *[[Leuchtturm Staberhuk]] supi *[[Leuchtturm Westermarkelsdorf]] """.decode("utf8")
    db = DummyDB()
    root = parseString(title="X33", raw=raw, wikidb=db)
    buildAdvancedTree(root)
    duplicate = root.copy()
    _treesanity(duplicate)

    def _compare(a, b):
        # node-by-node structural equality
        assert a.caption == b.caption
        assert a.__class__ == b.__class__
        assert len(a.children) == len(b.children)
        for idx, child in enumerate(a):
            _compare(child, b.children[idx])

    _compare(root, duplicate)
def setup():
    """Build the shared fixture tree used by the tests and return its root."""
    # WARNING: altering this nested list will probably break all tests!
    # Edit with care.
    t = [Article(),
         [Section(), [PreFormatted(), [Text("bla blub"), ImageLink()]]],
         [Table(), [Row(), [Cell(), PreFormatted(), [Text("jo")]], ImageLink()]],
         [Section(), [Section(), [Strong()]]],
         [Text("bla")],
    ]

    def rec(elements, parent):
        # A nested list attaches its contents to the most recently
        # appended node ("last"); a leading list would have no parent.
        last = None
        for c in elements:
            if isinstance(c, type([])):
                assert last
                rec(c, last)
            else:
                if parent:
                    parent.children.append(c)
                last = c

    rec(t, None)
    t = t[0]
    buildAdvancedTree(t)
    # import mwlib.parser, sys; mwlib.parser.show(sys.stderr, t, 0)
    return t
def test_ulist():
    """http://code.pediapress.com/wiki/ticket/222"""
    raw = u""" * A item *: B Previous item continues. """
    tree = parseString(title='t', raw=raw)
    buildAdvancedTree(tree)
    # a "*:" continuation line must not create a second list item
    assert len(tree.getChildNodesByClass(Item)) == 1
def purify_string(self, string, title='__no_name__'):
    """Parse *string* and return it as a list of PureSections."""
    markup = self.templateactions.handle_templates(string, title=title)
    parse_tree = myParseString.myParseString(
        title=title,
        raw=markup,
        wikidb=self.env.wiki,
        lang=self.env.wiki.siteinfo["general"]["lang"],
        uniq=self.templateactions.exp.uniquifier,
    )
    advtree.buildAdvancedTree(parse_tree)
    return self.purify(parse_tree)
def getXML(wikitext):
    """Render *wikitext* through the ODF writer and return the validated XML."""
    db = DummyDB()
    tree = parseString(title="test", raw=wikitext, wikidb=db)
    advtree.buildAdvancedTree(tree)
    preprocess(tree)
    mwlib.parser.show(sys.stdout, tree)
    writer = ODFWriter()
    writer.writeTest(tree)
    validate(writer)
    # the raw XML string is handy for inspecting the generated output
    return writer.asstring()
def preprocess(root):
    """Prepare *root* for writing: build the advanced tree, then run the
    full treecleaner over it (replaces the older xmltreecleaner passes)."""
    advtree.buildAdvancedTree(root)
    tc = TreeCleaner(root)
    tc.cleanAll()
def renderMW(txt, filesuffix=None):
    """Parse *txt*, clean the tree and render it with the RL writer.

    A fresh temporary directory is used as the writer's working/image dir.
    """
    tree = uparser.parseString(title='Test', raw=txt)
    advtree.buildAdvancedTree(tree)
    cleaner = TreeCleaner(tree)
    cleaner.cleanAll()
    workdir = tempfile.mkdtemp()
    writer = RlWriter(test_mode=True)
    writer.wikiTitle = 'testwiki'
    writer.tmpdir = workdir
    writer.imgDB = dummyImageDB(basedir=workdir)
    elements = writer.write(tree)
    renderElements(elements, filesuffix, workdir)
def test_attributes(): t1 = ''' {| |- STYLE="BACKGROUND:#FFDEAD;" |stuff |} ''' r = parseString(title='t', raw=t1) buildAdvancedTree(r) n = r.getChildNodesByClass(Row)[0] print n.attributes, n.style assert isinstance(n.style, dict) assert isinstance(n.attributes, dict) assert n.style["background"] == "#FFDEAD"
def _servXML(self, args, query, dialect="mwxml"):
    """HTTP handler: fetch an article and serve it as XML.

    *args* is the split URL path (last element = article title, the rest
    joined into the wiki base URL); *dialect* selects the writer:
    "mwxhtml", "mwxml" (1:1 parse-tree XML) or "dbxml" (DocBook).
    """
    if not len(args):
        self._doc(error="require articlename")
        return
    unknown = [k for k in query if k not in ("debug", "imageresolver")]
    if unknown:
        return self._doc(error="unknown option %r" % unknown)
    title = args.pop()
    base_url = "http://%s/" % ("/".join(args) or default_baseurl)
    # NOTE(review): bool() of any non-empty query string is True, so e.g.
    # ?debug=False would still enable debug -- TODO confirm intent
    debug = bool(query.setdefault("debug", [default_debug])[0])
    language = "en" # FIXME
    namespace="en.wikipedia.org" # FIXME
    print "_servXML", title, base_url, debug
    db = mwapidb.WikiDB(base_url)
    db.print_template = None # deactivate print template lookups
    tree = db.getParsedArticle(title, revision=None)
    if dialect == "mwxhtml":
        xhtmlwriter.preprocess(tree)
        dbw = xhtmlwriter.MWXHTMLWriter(imagesrcresolver=imagesrcresolver, debug=False)
    elif dialect == "mwxml":
        advtree.buildAdvancedTree(tree) # this should be optional
        dbw = xhtmlwriter.MWXMLWriter() # 1:1 XML from parse tree
    elif dialect == "dbxml":
        from mwlib import docbookwriter
        docbookwriter.preprocess(tree)
        dbw = docbookwriter.DocBookWriter(imagesrcresolver=imagesrcresolver, debug=debug)
    else:
        raise Exception, "unkonwn export"
    dbw.writeBook(tree)
    if debug:
        dbw.writeparsetree(tree)
    response = dbw.asstring()
    self.send_response(200)
    self.send_header("Content-type", "text/xml")
    self.send_header("Content-length", str(len(response)))
    self.end_headers()
    self.wfile.write(response)
    # shut down the connection
    self.wfile.flush()
def test_defintion_list():
    """http://code.pediapress.com/wiki/ticket/221"""
    raw = u''';termA :descr1 '''
    # second iteration re-runs the parse with all newlines stripped
    for _ in range(2):
        tree = parseString(title='t', raw=raw)
        buildAdvancedTree(tree)
        dls = tree.getChildNodesByClass(DefinitionList)
        assert len(dls) == 1
        assert dls[0].getChildNodesByClass(DefinitionTerm)
        assert dls[0].getChildNodesByClass(DefinitionDescription)
        raw = raw.replace('\n', '')
def cleanMarkup(raw): print "Parsing %r" % (raw,) tree = getTreeFromMarkup(raw) print "before treecleaner: >>>" showTree(tree) print "<<<" print '=' * 20 buildAdvancedTree(tree) tc = TreeCleaner(tree, save_reports=True) tc.cleanAll(skipMethods=[]) reports = tc.getReports() print "after treecleaner: >>>" showTree(tree) print "<<<" return (tree, reports)
def test_definitiondescription():
    """Nested ":"-lines all become DefinitionDescription nodes."""
    raw = u""" == test == :One ::Two :::Three ::::Four """
    db = DummyDB()
    r = parseString(title="t", raw=raw, wikidb=db)
    parser.show(sys.stdout, r)
    buildAdvancedTree(r)
    dd = r.getChildNodesByClass(DefinitionDescription)
    print "DD:", dd
    for c in dd:
        # NOTE(review): the test expects every description to report
        # indentlevel 1, i.e. nesting appears flattened -- TODO confirm
        assert c.indentlevel == 1
    assert len(dd) == 4
def worker(q, names, templr, noder):
    """Consume article names from the *names* queue, purify each article and
    push a Record(heading, content-length) per section onto *q*.

    Terminates (putting a None sentinel on *q*) once *names* stays empty
    for one second.  Per-article failures are logged and skipped.
    """
    env = templr.env
    purifier = classify.Preprocessor(env, templr, noder)
    rm = nshandling.get_redirect_matcher(env.wiki.siteinfo, env.wiki.nshandler)
    articles = 0
    skipped = 0
    while True:
        try:
            # blocking get with 1s timeout; Queue.Empty ends the worker
            name = names.get(True, 1)
            #only examine pages in the main namespace
            if env.wiki.nshandler.splitname(name)[0] == nshandling.NS_MAIN:
                raw = env.wiki.reader[name]
                #ignore redirects
                if not rm(raw):
                    raw = templr.handle_templates(raw, title=name)
                    tree = myParseString.myParseString(title=name, raw=raw, wikidb=env.wiki, lang=env.wiki.siteinfo["general"]["lang"], uniq=templr.exp.uniquifier)
                    advtree.buildAdvancedTree(tree)
                    #for all sections in each article
                    for s in purifier.purify(tree):
                        q.put(Record(s.heading, len(s.content)))
                    articles += 1
                else:
                    #for skipping a redirect
                    skipped += 1
            else:
                #for skipping a page not in NS_MAIN
                skipped += 1
        except Queue.Empty as excp:
            log.logger.info("examined " + str(articles) + " articles, skipped " + str(skipped))
            q.put(None)
            return
        except Exception as excp:
            # best effort: log the failing article and keep going
            msg = name + u": " + unicode(excp)
            log.logger.error(msg.encode("utf-8", "ignore"))
            if DEBUG:
                log.logger.error(traceback.format_exc())
def test_removeNewlines():
    """Whitespace handling of buildAdvancedTree in different node contexts."""
    ws = u"\t \n\t\n\n \n\n"

    # inside preformatted nodes whitespace must be left untouched
    pre = PreFormatted()
    node = Text(ws)
    pre.children.append(node)
    buildAdvancedTree(pre)
    _treesanity(pre)
    assert node.caption == ws

    # a whitespace-only text node at a section border is removed entirely
    sect = Section()
    node = Text(ws)
    sect.children.append(node)
    buildAdvancedTree(sect)
    _treesanity(sect)
    assert not sect.children

    # newlines around real content are replaced, total length is preserved
    keep = u"\t \n\t\n\n KEEP \n\n"
    sect = Section()
    node = Text(keep)
    sect.children.append(node)
    buildAdvancedTree(sect)
    _treesanity(sect)
    assert node.caption.count("\n") == 0
    assert len(node.caption) == len(keep)
    assert sect.children
def parse_and_purify(self, title, follow_redirects=False):
    """Parse the named article and return it as a list of PureSections.

    Returns None for redirect pages unless *follow_redirects* is set, in
    which case the redirect target is parsed instead (one hop only).
    """
    # Bug fix: the recursive call below passes the redirect target, which
    # is already a unicode object; unicode.decode('utf-8') in Python 2
    # first encodes to ASCII and raises UnicodeEncodeError for non-ASCII
    # titles.  Only decode raw byte strings.
    if isinstance(title, str):
        title = title.decode('utf-8')
    raw = self.env.wiki.reader[title]
    # check for redirect
    target = self.rm(raw)
    if target:
        if follow_redirects:
            logger.info(title + ' redirects to ' + target)
            # follow_redirects is deliberately not forwarded, so a chain
            # of redirects stops after one hop (avoids redirect loops)
            return self.parse_and_purify(target)
        else:
            return None
    else:
        markup = self.templateactions.handle_templates(raw, title=title)
        tree = myParseString.myParseString(
            title=title,
            raw=markup,
            wikidb=self.env.wiki,
            lang=self.env.wiki.siteinfo["general"]["lang"],
            uniq=self.templateactions.exp.uniquifier,
        )
        advtree.buildAdvancedTree(tree)
        return self.purify(tree)
def test_copy():
    """A copied advanced tree must mirror the original node for node."""
    raw = """ ===[[Leuchtturm|Leuchttürme]] auf Fehmarn=== *[[Leuchtturm Flügge]] super da *[[Leuchtturm Marienleuchte]] da auch *[[Leuchtturm Strukkamphuk]] spitze *[[Leuchtturm Staberhuk]] supi *[[Leuchtturm Westermarkelsdorf]] """.decode("utf8")
    db = DummyDB()
    original = parseString(title="X33", raw=raw, wikidb=db)
    buildAdvancedTree(original)
    clone = original.copy()
    _treesanity(clone)

    def _check(n1, n2):
        # caption, class and child count must all match, recursively
        assert n1.caption == n2.caption
        assert n1.__class__ == n2.__class__
        assert len(n1.children) == len(n2.children)
        for child, other in zip(n1, n2.children):
            _check(child, other)

    _check(original, clone)
def setup():
    """Build the shared fixture tree used by the tests and return its root."""
    # WARNING: altering this nested list will probably break all tests!
    # Edit with care.
    t = [
        Article(),
        [Section(), [PreFormatted(), [Text("bla blub"), ImageLink()]]],
        [
            Table(),
            [Row(), [Cell(), PreFormatted(), [Text("jo")]], ImageLink()]
        ],
        [Section(), [Section(), [Strong()]]],
        [Text("bla")],
    ]

    def rec(elements, parent):
        # A nested list attaches its contents to the most recently
        # appended node ("last"); a leading list would have no parent.
        last = None
        for c in elements:
            if isinstance(c, type([])):
                assert last
                rec(c, last)
            else:
                if parent:
                    parent.children.append(c)
                last = c

    rec(t, None)
    t = t[0]
    buildAdvancedTree(t)
    # import mwlib.parser, sys; mwlib.parser.show(sys.stderr, t, 0)
    return t
def test_colspan():
    """colspan handling: invalid values fall back to 1, valid ones are kept.

    Fixed: comparisons used "is 1" / "is 2" -- identity checks on ints only
    work by accident of CPython's small-integer caching; use == instead.
    """
    # non-numeric colspan is rejected and defaults to 1
    raw = '''<table><tr><td colspan="bogus">no colspan </td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 1

    # negative colspan is rejected as well
    raw = '''<table><tr><td colspan="-1">no colspan </td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 1

    # a sane positive value is preserved
    raw = '''<table><tr><td colspan="2">colspan1</td></tr></table>'''
    r = parseString(title='t', raw=raw)
    buildAdvancedTree(r)
    assert r.getChildNodesByClass(Cell)[0].colspan == 2
def preprocess(root):
    """Prepare *root* for writing: build the advanced tree, then clean it."""
    advtree.buildAdvancedTree(root)
    cleaner = TreeCleaner(root)
    cleaner.cleanAll()
#! /usr/bin/env python # -*- coding: utf-8 -*- import sys import time s = unicode(open(sys.argv[1], "rb").read(), "utf-8") from mwlib import uparser, advtree, treecleaner from mwlib.refine import compat stime = time.time() r = compat.parse_txt(s) print "parse:", time.time() - stime stime = time.time() advtree.buildAdvancedTree(r) print "tree", time.time() - stime stime = time.time() tc = treecleaner.TreeCleaner(r) tc.cleanAll() print "clean:", time.time() - stime
def buildAdvTree(raw):
    """Parse *raw*, run every treecleaner method and return the cleaned tree."""
    parse_tree = getTreeFromMarkup(raw)
    advtree.buildAdvancedTree(parse_tree)
    cleaner = TreeCleaner(parse_tree, save_reports=True)
    cleaner.cleanAll(skipMethods=[])
    return parse_tree
def getAdvTree(raw):
    """Parse *raw* and return its advanced tree (no cleaning applied)."""
    parse_tree = parseString(title='test', raw=raw)
    buildAdvancedTree(parse_tree)
    return parse_tree
def getTreeFromMarkup(raw):
    """Parse *raw* against a DummyDB and return the advanced tree."""
    parse_tree = parseString(title="Test", raw=raw, wikidb=DummyDB())
    advtree.buildAdvancedTree(parse_tree)
    return parse_tree
def preprocess(root):
    """Normalize the parse tree for XML output: build the advanced tree,
    then apply the xmltreecleaner fix-up passes in order."""
    advtree.buildAdvancedTree(root)
    for fix in (xmltreecleaner.removeChildlessNodes,
                xmltreecleaner.fixLists,
                xmltreecleaner.fixParagraphs,
                xmltreecleaner.fixBlockElements):
        fix(root)
{{Harv|Smith|2006| p=25}} "Cleanup" expands to "Asbox" that should be removed<br /> {{Cleanup}} <h2>Markup tests</h2> I wonder what happens to the heading above... == Wiki style header == As opposed to this one """ name = "Test" if args.file: name = args.article markup = util.file2s(args.article) else: page = env.wiki.get_page(args.article) raw = page.rawtext name = page.names[-1] markup = act.handle_templates(raw, title=name) tree = myParseString.myParseString(title=name, raw=markup, wikidb=env.wiki, lang=env.wiki.siteinfo["general"]["lang"], uniq=act.exp.uniquifier) advtree.buildAdvancedTree(tree) sections = purifier.purify(tree) for s in sections: print s