def __init__(self, *args, **kwargs):
    MWXHTMLWriter.__init__(self, *args, **kwargs)
    # keep reference list for each group separate
    self.references = defaultdict(list)
    # also keep named reference positions, separate for each group;
    # map named reference to 2-tuple of position first seen and count
    self.namedrefs = defaultdict(dict)
def getXHTML(wikitext): db = DummyDB() r = parseString(title="", raw=wikitext, wikidb=db) preprocess(r) dbw = MWXHTMLWriter() with SuppressOutput(): dbw.writeBook(r) return dbw.asstring()
def to_html(cls, kb_entry):
    r = kb_entry.body.replace("\r", "")
    parsed = parseString(title=kb_entry.subject, raw=r, wikidb=cls.NOCDB(kb_entry))
    preprocess(parsed)
    xhtml = MWXHTMLWriter()
    xhtml.writeBook(parsed)
    block = ET.tostring(xhtml.xmlbody)
    return block
def __init__(self, filters, env=None, status_callback=None, imagesrcresolver=None, debug=False):
    self.filters = filters
    MWXHTMLWriter.__init__(self, env, status_callback, imagesrcresolver, debug)
    # keep reference list for each group separate
    self.references = defaultdict(list)
    # also keep named reference positions, separate for each group;
    # map named reference to 2-tuple of position first seen and count
    self.namedrefs = defaultdict(dict)
def getXHTML(wikitext): db = DummyDB() r = parseString(title="test", raw=wikitext, wikidb=db) preprocess(r) show(sys.stdout, r) dbw = MWXHTMLWriter() dbw.writeBook(r) return dbw.asstring()
def __init__(self, filters, skip_refs=False, env=None, status_callback=None, imagesrcresolver=None, debug=False):
    #self.filters = filters
    self.exclude_classes = set(filters.get('EXCLUDE_CLASSES', ()))
    self.exclude_ids = set(filters.get('EXCLUDE_IDS', ()))
    MWXHTMLWriter.__init__(self, env, status_callback, imagesrcresolver, debug)
    # keep reference list for each group separate
    self.references = defaultdict(list)
    # also keep named reference positions, separate for each group;
    # map named reference to 2-tuple of position first seen and count
    self.namedrefs = defaultdict(dict)
    self.skip_refs = skip_refs
def getXHTML(wikitext, title, language):
    db = DummyDB()
    db.normalize_and_get_page = noop
    r = parseString(title=title, raw=wikitext, wikidb=db, lang=language)
    if not r:
        return None
    preprocess(r)
    removeLangLinks(r)
    dbw = MWXHTMLWriter()
    dbw.writeBook(r)
    return dbw.asstring()
def display_mediawiki(request, path, text):
    article = simpleparse(text)
    try:
        # Try to figure out a title by looking up svn properties
        caption = client.propget("svnweb:mediawiki:title", path).values()[0]
    except:
        caption = None
    if caption is None:
        # No explicit name, so we'll derive it from the filename
        caption = path.rpartition("/")[2]
        if "." in caption:
            caption = caption.rpartition(".")[0]
    article.caption = caption
    writer = MWXHTMLWriter()
    writer.xwriteStyle = mediawiki_xwriteStyle
    writer.xwriteArticleLink = mediawiki_xwriteArticleLink
    element = writer.write(article)
    result = ElementTree.tostring(element)
    result = """
<html><head><title>%s</title>
<style type="text/css">
body {font-size: 13px; font-family: sans-serif; padding: 30px; background-color: #f9f9f9}
.content {border: 1px solid #aaa; padding: 10px; padding-top: 0px; background-color: white}
.content div[class~="mwx.paragraph"] {margin-bottom: 12px}
.content > div > h1 {font-size: 24px; width: 100%%; border-bottom: 1px solid #aaa; margin-bottom: 12px; margin-top: 8px}
.content > div > div > h2 {font-size: 19px; width: 100%%; border-bottom: 1px solid #aaa; margin-bottom: 8px}
.content > div > div > div > h2 {font-size: 17px; margin-bottom: 7px; margin-top: 18px}
.content > div > div > div > div > h2 {font-size: 15px; margin-bottom: 4px}
.content > div > div > div > div > div > h2 {font-size: 13px; margin-bottom: 4px}
span[class~="mwx.svnweb.bold"] {font-weight: bold}
span[class~="mwx.svnweb.italic"] {font-style: italic}
</style>
</head><body>
<div class="content">
""" % caption + result + """
</div></body></html>
"""
    return "text/html", result
def run(self):
    raw = u'\n'.join(self.content)
    # empty wikidb
    db = DummyDB()
    # run parser and pre-processors
    parsed = parseString(title='Export', raw=raw, wikidb=db)
    preprocess(parsed)
    # write XHTML
    xhtml = MWXHTMLWriter()
    xhtml.writeBook(parsed)
    # remove the H1 heading (title) from the document
    article = xhtml.xmlbody.getchildren()[0]
    article.remove(article.getchildren()[0])  # remove caption
    # render to string
    block = ET.tostring(xhtml.xmlbody)
    return [nodes.raw('', block, format='html')]
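# A minimal registration sketch, not from the source: the run() above is
# assumed to belong to a docutils Directive subclass, here called
# WikiTextDirective, registered under the name "wikitext" -- both names are
# assumptions for illustration only.
from docutils.parsers.rst import directives, Directive

class WikiTextDirective(Directive):
    has_content = True  # the directive body holds the raw wikitext
    # run() as defined above goes here

directives.register_directive('wikitext', WikiTextDirective)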
def to_html(cls, kb_entry):
    from mwlib.uparser import parseString
    from mwlib.xhtmlwriter import MWXHTMLWriter, preprocess
    try:
        import xml.etree.ElementTree as ET
    except ImportError:
        from elementtree import ElementTree as ET
    r = kb_entry.body.replace("\r", "")
    parsed = parseString(title=kb_entry.subject, raw=r, wikidb=cls.NOCDB(kb_entry))
    preprocess(parsed)
    xhtml = MWXHTMLWriter()
    xhtml.writeBook(parsed)
    block = ET.tostring(xhtml.xmlbody)
    return block
def xwriteTable(self, obj):
    tableclasses = obj.attributes.get('class', '').split()
    if any(tableclass in self.exclude_classes for tableclass in tableclasses):
        return SkipChildren()
    id_attr = obj.attributes.get('id', '')
    if id_attr in self.exclude_ids:
        return SkipChildren()
    return MWXHTMLWriter.xwriteTable(self, obj)
def xwriteGenericElement(self, obj):
    classes = obj.attributes.get('class', '').split()
    if any(cl in EXCLUDE_CLASSES for cl in classes):
        return SkipChildren()
    id_attr = obj.attributes.get('id', '')
    if id_attr in EXCLUDED_IDS:
        return SkipChildren()
    return MWXHTMLWriter.xwriteGenericElement(self, obj)
def xwriteTable(self, obj):
    tableclasses = obj.attributes.get('class', '').split()
    if any(tableclass in EXCLUDE_CLASSES for tableclass in tableclasses):
        return SkipChildren()
    id_attr = obj.attributes.get('id', '')
    if id_attr in EXCLUDED_IDS:
        return SkipChildren()
    return MWXHTMLWriter.xwriteTable(self, obj)
def xwriteGenericElement(self, obj):
    classes = obj.attributes.get('class', '').split()
    if any(genericclass in self.exclude_classes for genericclass in classes):
        return SkipChildren()
    id_attr = obj.attributes.get('id', '')
    if id_attr in self.exclude_ids:
        return SkipChildren()
    return MWXHTMLWriter.xwriteGenericElement(self, obj)
def xwriteTable(self, obj):
    tableclasses = obj.attributes.get('class', '').split()
    if (len(self.filters['EXCLUDE_CLASSES']) > 0 and
            any(tableclass in self.filters['EXCLUDE_CLASSES']
                for tableclass in tableclasses)):
        return SkipChildren()
    id_attr = obj.attributes.get('id', '')
    if (len(self.filters['EXCLUDE_IDS']) > 0 and
            id_attr in self.filters['EXCLUDE_IDS']):
        return SkipChildren()
    return MWXHTMLWriter.xwriteTable(self, obj)
def xwriteGenericElement(self, obj):
    classes = obj.attributes.get('class', '').split()
    if (len(self.filters['EXCLUDE_CLASSES']) > 0 and
            any(genericclass in self.filters['EXCLUDE_CLASSES']
                for genericclass in classes)):
        return SkipChildren()
    id_attr = obj.attributes.get('id', '')
    if (len(self.filters['EXCLUDE_IDS']) > 0 and
            id_attr in self.filters['EXCLUDE_IDS']):
        return SkipChildren()
    return MWXHTMLWriter.xwriteGenericElement(self, obj)
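# A minimal instantiation sketch, assuming the filter-aware overrides above
# live in an MWXHTMLWriter subclass; "FilteringXHTMLWriter" and the sample
# filter values are assumptions, not names from the source.
filters = {
    'EXCLUDE_CLASSES': ['navbox', 'metadata'],
    'EXCLUDE_IDS': ['toc'],
}
writer = FilteringXHTMLWriter(filters)
writer.writeBook(parsed)    # parsed = preprocessed tree from mwlib's parseString()
print writer.asstring()     # tables/elements matching the filters are skipped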
def parseENwikt():
    wiktionaryGet.getWiktionaries(['en'])
    fh = bz2.BZ2File("enwiktionary-latest-pages-meta-current.xml.bz2")
    bg_en = {}
    en_bg = {}
    debug = False
    if debug:
        try:
            from IPython.Shell import IPShellEmbed
            ipshell = IPShellEmbed()
        except ImportError:
            from IPython import embed
            ipshell = embed
    cyrlRE = re.compile(ur'[\u0400-\u04FF\u0500-\u052F]', re.UNICODE)
    bulRE = re.compile("[bB]ulgarian", re.UNICODE)
    bulgarianSingle = re.compile(r"\* [bB]ulgarian", re.UNICODE)
    bulgarianSectionStart = re.compile("^==Bulgarian==$", re.UNICODE)
    bulgarianSectionEnd = re.compile("^==[A-Za-z-]+==$", re.UNICODE)
    keep = False
    read = False
    w = MWXHTMLWriter()
    while 1:
        line = fh.readline()
        if not line:
            break
        if line == " <page>\n":
            article = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
            read = True
        elif line == " </page>\n":
            read = False
            if keep:
                keep = False
                article += line
                root = xml.dom.minidom.parseString(article)
                if len(root.getElementsByTagName("text")[0].childNodes) > 0:
                    title = root.getElementsByTagName("title")[0].firstChild.data
                    text = root.getElementsByTagName("text")[0].firstChild.data
                    newText = ""
                    Bulg = False
                    # keep only the Bulgarian section and "* Bulgarian" translation lines
                    for line in text.split('\n'):
                        if bulgarianSectionStart.search(line):
                            Bulg = True
                        elif bulgarianSectionEnd.search(line):
                            Bulg = False
                        if Bulg:
                            newText += line + '\n'
                        elif bulgarianSingle.search(line):
                            newText += line + '\n'
                    if newText != "":
                        p = parseString(title, newText)
                        if cyrlRE.search(title):
                            if debug:
                                print "bg_en = " + newText.encode('utf-8')
                                ipshell()
                            bg_en[title] = ''.join(
                                ET.tostring(w.write(p), encoding="utf-8",
                                            method="html").split('\n'))
                        else:
                            if debug:
                                print "en_bg = " + newText.encode('utf-8')
                                ipshell()
                            en_bg[title] = ''.join(
                                ET.tostring(w.write(p), encoding="utf-8",
                                            method="html").split('\n'))
        if read:
            if bulRE.search(line):
                keep = True
            article += line
    enWiktBG = bz2.BZ2File("enWiktBG.pickle.bz2", 'wb')
    pickle.dump((bg_en, en_bg), enWiktBG, pickle.HIGHEST_PROTOCOL)
    enWiktBG.close()
def __init__(self, **kargs):
    #MWXHTMLWriter.__init__(self, **kargs)
    MWXHTMLWriter.__init__(self, None, None, "images/IMAGENAME", False)
    self.root = self.xmlbody = ET.Element("body")
    self.paratag = "p"