Example #1
0
 def __init__(self, *args, **kwargs):
     """Initialise the base writer and the per-group reference bookkeeping.

     All positional and keyword arguments are forwarded unchanged to
     ``MWXHTMLWriter.__init__``.
     """
     MWXHTMLWriter.__init__(self, *args, **kwargs)
     # Named reference positions, kept separately for each group: maps a
     # named reference to a 2-tuple of (position first seen, count).
     self.namedrefs = defaultdict(dict)
     # Reference list, kept separately for each group.
     self.references = defaultdict(list)
Example #2
0
def getXHTML(wikitext):
    """Convert a wikitext string into its XHTML string representation.

    The text is parsed with an empty title against a fresh ``DummyDB``,
    preprocessed, and written with ``MWXHTMLWriter``; the writer runs inside
    a ``SuppressOutput`` context.

    :param wikitext: raw wiki markup to convert.
    :returns: whatever ``MWXHTMLWriter.asstring()`` produces for the book.
    """
    tree = parseString(title="", raw=wikitext, wikidb=DummyDB())
    preprocess(tree)
    writer = MWXHTMLWriter()
    with SuppressOutput():
        writer.writeBook(tree)
    return writer.asstring()
Example #3
0
 def to_html(cls, kb_entry):
     """Render the wiki body of ``kb_entry`` to serialized XHTML.

     Carriage returns are stripped from ``kb_entry.body`` before parsing;
     ``kb_entry.subject`` is used as the title and ``cls.NOCDB(kb_entry)``
     as the wiki database.

     :param kb_entry: object exposing ``body`` and ``subject`` attributes.
     :returns: the writer's ``xmlbody`` serialized with ``ET.tostring``.
     """
     wikitext = kb_entry.body.replace("\r", "")
     tree = parseString(
         title=kb_entry.subject,
         raw=wikitext,
         wikidb=cls.NOCDB(kb_entry),
     )
     preprocess(tree)
     writer = MWXHTMLWriter()
     writer.writeBook(tree)
     return ET.tostring(writer.xmlbody)
Example #4
0
 def __init__(self,
              filters,
              env=None,
              status_callback=None,
              imagesrcresolver=None,
              debug=False):
     """Store the filters and initialise the base XHTML writer.

     :param filters: filter configuration, stored as ``self.filters``.
     :param env: forwarded positionally to ``MWXHTMLWriter.__init__``.
     :param status_callback: forwarded to ``MWXHTMLWriter.__init__``.
     :param imagesrcresolver: forwarded to ``MWXHTMLWriter.__init__``.
     :param debug: forwarded to ``MWXHTMLWriter.__init__``.
     """
     self.filters = filters
     MWXHTMLWriter.__init__(
         self, env, status_callback, imagesrcresolver, debug)
     # Named reference positions, kept separately for each group: maps a
     # named reference to a 2-tuple of (position first seen, count).
     self.namedrefs = defaultdict(dict)
     # Reference list, kept separately for each group.
     self.references = defaultdict(list)
Example #5
0
def getXHTML(wikitext):
    """Convert a wikitext string into its XHTML string representation.

    The text is parsed under the title ``"test"`` against a fresh
    ``DummyDB`` and preprocessed; the resulting tree is passed to ``show``
    together with ``sys.stdout`` before being written with ``MWXHTMLWriter``.

    :param wikitext: raw wiki markup to convert.
    :returns: whatever ``MWXHTMLWriter.asstring()`` produces for the book.
    """
    tree = parseString(title="test", raw=wikitext, wikidb=DummyDB())
    preprocess(tree)
    show(sys.stdout, tree)
    writer = MWXHTMLWriter()
    writer.writeBook(tree)
    return writer.asstring()
Example #6
0
 def __init__(self,
              filters,
              skip_refs=False,
              env=None,
              status_callback=None,
              imagesrcresolver=None,
              debug=False):
     """Set up exclusion filters, the base writer and reference bookkeeping.

     :param filters: mapping read with ``.get``; the ``'EXCLUDE_CLASSES'``
         and ``'EXCLUDE_IDS'`` entries (default: empty) are copied into sets.
     :param skip_refs: stored as ``self.skip_refs``.
     :param env: forwarded positionally to ``MWXHTMLWriter.__init__``.
     :param status_callback: forwarded to ``MWXHTMLWriter.__init__``.
     :param imagesrcresolver: forwarded to ``MWXHTMLWriter.__init__``.
     :param debug: forwarded to ``MWXHTMLWriter.__init__``.
     """
     self.exclude_ids = set(filters.get('EXCLUDE_IDS', ()))
     self.exclude_classes = set(filters.get('EXCLUDE_CLASSES', ()))
     MWXHTMLWriter.__init__(
         self, env, status_callback, imagesrcresolver, debug)
     self.skip_refs = skip_refs
     # Named reference positions, kept separately for each group: maps a
     # named reference to a 2-tuple of (position first seen, count).
     self.namedrefs = defaultdict(dict)
     # Reference list, kept separately for each group.
     self.references = defaultdict(list)
Example #7
0
def getXHTML(wikitext, title, language):
    """Convert wikitext to an XHTML string, with language links removed.

    A ``DummyDB`` whose ``normalize_and_get_page`` is replaced by ``noop``
    serves as the wiki database.

    :param wikitext: raw wiki markup to convert.
    :param title: title handed to ``parseString``.
    :param language: passed to ``parseString`` as ``lang``.
    :returns: ``None`` when the parse result is falsy, otherwise whatever
        ``MWXHTMLWriter.asstring()`` produces for the book.
    """
    wikidb = DummyDB()
    wikidb.normalize_and_get_page = noop
    tree = parseString(
        title=title, raw=wikitext, wikidb=wikidb, lang=language)
    if tree:
        preprocess(tree)
        removeLangLinks(tree)
        writer = MWXHTMLWriter()
        writer.writeBook(tree)
        return writer.asstring()
    return None
def getXHTML(wikitext, title, language):
    """Convert wikitext to an XHTML string, with language links removed.

    A ``DummyDB`` whose ``normalize_and_get_page`` is replaced by ``noop``
    serves as the wiki database.

    :param wikitext: raw wiki markup to convert.
    :param title: title handed to ``parseString``.
    :param language: passed to ``parseString`` as ``lang``.
    :returns: ``None`` when the parse result is falsy, otherwise whatever
        ``MWXHTMLWriter.asstring()`` produces for the book.
    """
    wikidb = DummyDB()
    wikidb.normalize_and_get_page = noop
    tree = parseString(
        title=title, raw=wikitext, wikidb=wikidb, lang=language)
    if tree:
        preprocess(tree)
        removeLangLinks(tree)
        writer = MWXHTMLWriter()
        writer.writeBook(tree)
        return writer.asstring()
    return None
Example #9
0
def display_mediawiki(request, path, text):
    """Render MediaWiki markup as a standalone, styled HTML page.

    The page caption is taken from the ``svnweb:mediawiki:title`` svn
    property of ``path`` when it can be read; otherwise it is derived from
    the last path component with its final extension removed.

    :param request: not used by this function.
    :param path: path of the document; used for the property lookup and as
        the fallback source of the caption.
    :param text: MediaWiki markup to render.
    :returns: a ``("text/html", page)`` 2-tuple.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    article = simpleparse(text)
    try:
        # Try to figure out a title by looking up svn properties.
        # list(...) keeps the lookup working when values() is a view.
        props = client.propget("svnweb:mediawiki:title", path)
        caption = list(props.values())[0]
    except Exception:
        # Best-effort lookup: any failure falls back to the filename, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        caption = None
    if caption is None: # No explicit name, so we'll derive it from the filename
        caption = path.rpartition("/")[2]
        if "." in caption:
            caption = caption.rpartition(".")[0]
    article.caption = caption
    writer = MWXHTMLWriter()
    writer.xwriteStyle = mediawiki_xwriteStyle
    writer.xwriteArticleLink = mediawiki_xwriteArticleLink
    element = writer.write(article)
    result = ElementTree.tostring(element)
    # The caption comes from repository data, so escape it before placing
    # it in the <title> markup.
    result = """
    <html><head><title>%s</title>
    
    <style type="text/css">
    
    body {font-size: 13px; font-family: sans-serif; padding: 30px;
          background-color: #f9f9f9}
    
    .content {border: 1px solid #aaa; padding: 10px; padding-top: 0px;
              background-color: white}
    
    .content div[class~="mwx.paragraph"] {margin-bottom: 12px}
    
    .content > div > h1 {font-size: 24px; width: 100%%;
                         border-bottom: 1px solid #aaa; margin-bottom: 12px;
                         margin-top: 8px}
    
    .content > div > div > h2 {font-size: 19px; width: 100%%; 
                           border-bottom: 1px solid #aaa;
                           margin-bottom: 8px}
    .content > div > div > div > h2 {font-size: 17px; margin-bottom: 7px;
                                     margin-top: 18px}
    .content > div > div > div > div > h2 {font-size: 15px; margin-bottom: 4px}
    .content > div > div > div > div > div > h2 {font-size: 13px; margin-bottom: 4px}
    
    span[class~="mwx.svnweb.bold"] {font-weight: bold}
    span[class~="mwx.svnweb.italic"] {font-style: italic}
    
    </style>
    
    </head><body>
    <div class="content">
    """ % escape(caption) + result + """
    </div></body></html>
    """
    return "text/html", result
Example #10
0
 def run(self):
     """Render this object's wiki ``content`` lines as a raw HTML node.

     The lines in ``self.content`` are joined with newlines, parsed against
     an empty ``DummyDB`` under the title ``'Export'``, preprocessed and
     written with ``MWXHTMLWriter``. The first child of the first element
     of the XHTML body (the H1 caption) is removed before serialization.

     :returns: a one-element list holding a ``nodes.raw`` node with
         ``format='html'``.
     """
     raw = u'\n'.join(self.content)
     # empty wikidb
     db = DummyDB()
     # run parser and pre-processors
     parsed = parseString(title='Export', raw=raw, wikidb=db)
     preprocess(parsed)
     # write XHTML
     xhtml = MWXHTMLWriter()
     xhtml.writeBook(parsed)
     # remove the H1 heading (title) from the document; elements are
     # indexed directly because Element.getchildren() is deprecated and
     # was removed in Python 3.9
     article = xhtml.xmlbody[0]
     article.remove(article[0]) # remove caption
     # render to string
     block = ET.tostring(xhtml.xmlbody)
     return [nodes.raw('', block, format='html')]
Example #11
0
 def to_html(cls, kb_entry):
     """Render the wiki body of ``kb_entry`` to serialized XHTML.

     The mwlib and ElementTree imports are done locally. Carriage returns
     are stripped from ``kb_entry.body`` before parsing; ``kb_entry.subject``
     is used as the title and ``cls.NOCDB(kb_entry)`` as the wiki database.

     :param kb_entry: object exposing ``body`` and ``subject`` attributes.
     :returns: the writer's ``xmlbody`` serialized with ``ET.tostring``.
     """
     from mwlib.uparser import parseString
     from mwlib.xhtmlwriter import MWXHTMLWriter, preprocess
     try:
         import xml.etree.ElementTree as ET
     except ImportError:
         # Only a missing module should trigger the standalone fallback;
         # any other error is left to propagate.
         from elementtree import ElementTree as ET
     r = kb_entry.body.replace("\r", "")
     parsed = parseString(title=kb_entry.subject,
                          raw=r,
                          wikidb=cls.NOCDB(kb_entry))
     preprocess(parsed)
     xhtml = MWXHTMLWriter()
     xhtml.writeBook(parsed)
     block = ET.tostring(xhtml.xmlbody)
     return block
Example #12
0
 def xwriteTable(self, obj):
     """Write a table unless its class or id attribute is excluded.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``self.exclude_classes`` or its ``id`` attribute is in
     ``self.exclude_ids``; otherwise delegates to
     ``MWXHTMLWriter.xwriteTable``.
     """
     for css_class in obj.attributes.get('class', '').split():
         if css_class in self.exclude_classes:
             return SkipChildren()
     if obj.attributes.get('id', '') in self.exclude_ids:
         return SkipChildren()
     return MWXHTMLWriter.xwriteTable(self, obj)
Example #13
0
 def xwriteGenericElement(self, obj):
     """Write a generic element unless its class or id is excluded.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``EXCLUDE_CLASSES`` or its ``id`` attribute is in
     ``EXCLUDED_IDS``; otherwise delegates to
     ``MWXHTMLWriter.xwriteGenericElement``.
     """
     for css_class in obj.attributes.get('class', '').split():
         if css_class in EXCLUDE_CLASSES:
             return SkipChildren()
     if obj.attributes.get('id', '') in EXCLUDED_IDS:
         return SkipChildren()
     return MWXHTMLWriter.xwriteGenericElement(self, obj)
Example #14
0
 def xwriteTable(self, obj):
     """Write a table unless its class or id attribute is excluded.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``EXCLUDE_CLASSES`` or its ``id`` attribute is in
     ``EXCLUDED_IDS``; otherwise delegates to
     ``MWXHTMLWriter.xwriteTable``.
     """
     for css_class in obj.attributes.get('class', '').split():
         if css_class in EXCLUDE_CLASSES:
             return SkipChildren()
     if obj.attributes.get('id', '') in EXCLUDED_IDS:
         return SkipChildren()
     return MWXHTMLWriter.xwriteTable(self, obj)
Example #15
0
 def xwriteGenericElement(self, obj):
     """Write a generic element unless its class or id is excluded.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``self.exclude_classes`` or its ``id`` attribute is in
     ``self.exclude_ids``; otherwise delegates to
     ``MWXHTMLWriter.xwriteGenericElement``.
     """
     for css_class in obj.attributes.get('class', '').split():
         if css_class in self.exclude_classes:
             return SkipChildren()
     if obj.attributes.get('id', '') in self.exclude_ids:
         return SkipChildren()
     return MWXHTMLWriter.xwriteGenericElement(self, obj)
Example #16
0
 def xwriteTable(self, obj):
     """Write a table unless ``self.filters`` excludes its class or id.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``self.filters['EXCLUDE_CLASSES']`` or its ``id``
     attribute is in ``self.filters['EXCLUDE_IDS']``; otherwise delegates
     to ``MWXHTMLWriter.xwriteTable``.
     """
     css_classes = obj.attributes.get('class', '').split()
     banned_classes = self.filters['EXCLUDE_CLASSES']
     if len(banned_classes) > 0:
         for css_class in css_classes:
             if css_class in banned_classes:
                 return SkipChildren()
     element_id = obj.attributes.get('id', '')
     banned_ids = self.filters['EXCLUDE_IDS']
     if len(banned_ids) > 0 and element_id in banned_ids:
         return SkipChildren()
     return MWXHTMLWriter.xwriteTable(self, obj)
Example #17
0
 def xwriteGenericElement(self, obj):
     """Write a generic element unless ``self.filters`` excludes it.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``self.filters['EXCLUDE_CLASSES']`` or its ``id``
     attribute is in ``self.filters['EXCLUDE_IDS']``; otherwise delegates
     to ``MWXHTMLWriter.xwriteGenericElement``.
     """
     css_classes = obj.attributes.get('class', '').split()
     banned_classes = self.filters['EXCLUDE_CLASSES']
     if len(banned_classes) > 0:
         for css_class in css_classes:
             if css_class in banned_classes:
                 return SkipChildren()
     element_id = obj.attributes.get('id', '')
     banned_ids = self.filters['EXCLUDE_IDS']
     if len(banned_ids) > 0 and element_id in banned_ids:
         return SkipChildren()
     return MWXHTMLWriter.xwriteGenericElement(self, obj)
Example #18
0
 def __init__(self, filters, skip_refs=False, env=None,
              status_callback=None, imagesrcresolver=None, debug=False):
     """Set up exclusion filters, the base writer and reference bookkeeping.

     :param filters: mapping read with ``.get``; the ``'EXCLUDE_CLASSES'``
         and ``'EXCLUDE_IDS'`` entries (default: empty) are copied into sets.
     :param skip_refs: stored as ``self.skip_refs``.
     :param env: forwarded positionally to ``MWXHTMLWriter.__init__``.
     :param status_callback: forwarded to ``MWXHTMLWriter.__init__``.
     :param imagesrcresolver: forwarded to ``MWXHTMLWriter.__init__``.
     :param debug: forwarded to ``MWXHTMLWriter.__init__``.
     """
     self.exclude_ids = set(filters.get('EXCLUDE_IDS', ()))
     self.exclude_classes = set(filters.get('EXCLUDE_CLASSES', ()))
     MWXHTMLWriter.__init__(
         self, env, status_callback, imagesrcresolver, debug)
     self.skip_refs = skip_refs
     # Named reference positions, kept separately for each group: maps a
     # named reference to a 2-tuple of (position first seen, count).
     self.namedrefs = defaultdict(dict)
     # Reference list, kept separately for each group.
     self.references = defaultdict(list)
Example #19
0
 def xwriteGenericElement(self, obj):
     """Write a generic element unless its class or id is excluded.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``self.exclude_classes`` or its ``id`` attribute is in
     ``self.exclude_ids``; otherwise delegates to
     ``MWXHTMLWriter.xwriteGenericElement``.
     """
     for css_class in obj.attributes.get('class', '').split():
         if css_class in self.exclude_classes:
             return SkipChildren()
     if obj.attributes.get('id', '') in self.exclude_ids:
         return SkipChildren()
     return MWXHTMLWriter.xwriteGenericElement(self, obj)
Example #20
0
 def xwriteTable(self, obj):
     """Write a table unless its class or id attribute is excluded.

     Returns ``SkipChildren()`` when any whitespace-separated class of
     ``obj`` is in ``self.exclude_classes`` or its ``id`` attribute is in
     ``self.exclude_ids``; otherwise delegates to
     ``MWXHTMLWriter.xwriteTable``.
     """
     for css_class in obj.attributes.get('class', '').split():
         if css_class in self.exclude_classes:
             return SkipChildren()
     if obj.attributes.get('id', '') in self.exclude_ids:
         return SkipChildren()
     return MWXHTMLWriter.xwriteTable(self, obj)
Example #21
0
def parseENwikt():
    wiktionaryGet.getWiktionaries(['en'])
    fh = bz2.BZ2File("enwiktionary-latest-pages-meta-current.xml.bz2")

    bg_en = {}
    en_bg = {}

    debug = False

    if debug:
        try:
            from IPython.Shell import IPShellEmbed
            ipshell = IPShellEmbed()
        except:
            from IPython import embed
            ipshell = embed

    cyrlRE = re.compile(ur'[\u0400-\u04FF\u0500-\u052F]', re.UNICODE)
    bulRE = re.compile("[bB]ulgarian", re.UNICODE)
    bulgarianSingle = re.compile("\* [bB]ulgarian", re.UNICODE)
    bulgarianSectionStart = re.compile("^==Bulgarian==$", re.UNICODE)
    bulgarianSectionEnd = re.compile("^==[A-Za-z-]+==$", re.UNICODE)

    keep = False
    read = False

    w = MWXHTMLWriter()

    while 1:
        line = fh.readline()
        if not line:
            break
        if line == "  <page>\n":
            article = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
            read = True
        elif line == "  </page>\n":
            read = False
            if keep:
                keep = False
                article += line
                root = xml.dom.minidom.parseString(article)
                if len(root.getElementsByTagName("text")[0].childNodes) > 0:
                    title = root.getElementsByTagName(
                        "title")[0].firstChild.data
                    text = root.getElementsByTagName("text")[0].firstChild.data
                    newText = ""
                    Bulg = False
                    for line in text.split('\n'):
                        if bulgarianSectionStart.search(line):
                            Bulg = True
                        elif bulgarianSectionEnd.search(line):
                            Bulg = False
                        if Bulg == True:
                            newText += line + '\n'
                        elif bulgarianSingle.search(line):
                            newText += line + '\n'
                    if newText is not "":
                        p = parseString(title, newText)
                        if cyrlRE.search(title):
                            if debug:
                                print "bg_en = " + newText.encode('utf-8')
                                ipshell()
                            bg_en[title] = ''.join(
                                ET.tostring(w.write(p),
                                            encoding="utf-8",
                                            method="html").split('\n'))
                        else:
                            if debug:
                                print "en_bg = " + newText.encode('utf-8')
                                ipshell()
                            en_bg[title] = ''.join(
                                ET.tostring(w.write(p),
                                            encoding="utf-8",
                                            method="html").split('\n'))
        if read:
            if bulRE.search(line):
                keep = True
            article += line

    enWiktBG = bz2.BZ2File("enWiktBG.pickle.bz2", 'wb')

    pickle.dump((bg_en, en_bg), enWiktBG, pickle.HIGHEST_PROTOCOL)

    enWiktBG.close()
Example #22
0
 def __init__(self, **kargs):
     """Initialise the base writer with fixed arguments and a ``body`` root.

     ``kargs`` is accepted but not forwarded: ``MWXHTMLWriter.__init__`` is
     always called with ``None, None, "images/IMAGENAME", False``. Both
     ``self.root`` and ``self.xmlbody`` refer to the same new ``body``
     element, and ``self.paratag`` is set to ``"p"``.
     """
     MWXHTMLWriter.__init__(self, None, None, "images/IMAGENAME", False)
     body = ET.Element("body")
     self.root = body
     self.xmlbody = body
     self.paratag = "p"
Example #23
0
def parseENwikt():
    wiktionaryGet.getWiktionaries(['en'])
    fh = bz2.BZ2File("enwiktionary-latest-pages-meta-current.xml.bz2")

    bg_en = {}
    en_bg = {}

    debug = False

    if debug:
        try:
            from IPython.Shell import IPShellEmbed
            ipshell = IPShellEmbed()
        except:
            from IPython import embed
            ipshell = embed

    cyrlRE = re.compile(ur'[\u0400-\u04FF\u0500-\u052F]', re.UNICODE)
    bulRE = re.compile("[bB]ulgarian", re.UNICODE)
    bulgarianSingle = re.compile("\* [bB]ulgarian", re.UNICODE)
    bulgarianSectionStart = re.compile("^==Bulgarian==$", re.UNICODE)
    bulgarianSectionEnd = re.compile("^==[A-Za-z-]+==$", re.UNICODE)

    keep = False
    read = False

    w = MWXHTMLWriter()

    while 1:
        line = fh.readline()
        if not line:
            break
        if line == "  <page>\n":
            article = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
            read = True
        elif line == "  </page>\n":
            read = False
            if keep:
                keep = False
                article += line
                root = xml.dom.minidom.parseString(article)
                if len(root.getElementsByTagName("text")[0].childNodes) > 0:
                    title = root.getElementsByTagName("title")[0].firstChild.data
                    text = root.getElementsByTagName("text")[0].firstChild.data
                    newText = ""
                    Bulg = False
                    for line in text.split('\n'):
                        if bulgarianSectionStart.search(line):
                            Bulg = True
                        elif bulgarianSectionEnd.search(line):
                            Bulg = False
                        if Bulg == True:
                            newText += line + '\n'
                        elif bulgarianSingle.search(line):
                            newText += line + '\n'
                    if newText is not "":
                        p = parseString(title, newText)
                        if cyrlRE.search(title):
                            if debug:
                                print "bg_en = " + newText.encode('utf-8')
                                ipshell()
                            bg_en[title] = ''.join(ET.tostring(w.write(p), encoding = "utf-8", method = "html").split('\n'))
                        else:
                            if debug:
                                print "en_bg = " + newText.encode('utf-8')
                                ipshell()
                            en_bg[title] = ''.join(ET.tostring(w.write(p), encoding = "utf-8", method = "html").split('\n'))
        if read:
            if bulRE.search(line):
                keep = True
            article += line

    enWiktBG = bz2.BZ2File("enWiktBG.pickle.bz2", 'wb')

    pickle.dump((bg_en, en_bg), enWiktBG, pickle.HIGHEST_PROTOCOL)

    enWiktBG.close()