Ejemplo n.º 1
0
 def __init__(self, config_dir, data_dir):
     """Load and validate the configuration document.

     Changes the process working directory to `data_dir`, loads the
     XML catalog `catalog.xml` found there, then parses `jjigw.xml`
     from `config_dir` with validation switched on.

     After a successful load the instance exposes:

     - `doc`: the parsed document,
     - `connect`: a `ConnectConfig` built from the first
       ``jjigw/connect`` node,
     - `networks`: `NetworkConfig` objects keyed by `jid.domain`,
     - `jid`: the JID of the first network whose JID is truthy
       (`None` when there is none),
     - `spidentd`: a `SPIdentDConfig` for the first ``jjigw/spidentd``
       node, or `None` when the document has no such node,
     - `admins`: a list of `JID` objects, one per ``jjigw/admin`` node.

     Raises `JJIGWFatalError` when the parser reports the document as
     not valid.
     """
     self.doc = None
     self.config_dir = config_dir
     self.data_dir = data_dir
     os.chdir(data_dir)

     libxml2.initializeCatalog()
     libxml2.loadCatalog(os.path.join(data_dir, "catalog.xml"))

     parser = libxml2.createFileParserCtxt(
         os.path.join(config_dir, "jjigw.xml"))
     parser.validate(1)
     parser.parseDocument()
     if not parser.isValid():
         # Call form of raise: accepted by both Python 2 and Python 3.
         raise JJIGWFatalError("Invalid configuration")
     self.doc = parser.doc()

     self.connect = ConnectConfig(self.doc.xpathEval("jjigw/connect")[0])

     self.jid = None
     self.networks = {}
     for node in self.doc.xpathEval("jjigw/network"):
         network = NetworkConfig(node)
         # The first network seen supplies the instance-wide JID.
         if not self.jid:
             self.jid = network.jid
         self.networks[network.jid.domain] = network

     spidentd = self.doc.xpathEval("jjigw/spidentd")
     if spidentd:
         self.spidentd = SPIdentDConfig(spidentd[0])
     else:
         self.spidentd = None

     self.admins = [JID(node.getContent())
                    for node in self.doc.xpathEval("jjigw/admin")]
Ejemplo n.º 2
0
 def __init__(self, config_dir, data_dir):
     """Load and validate the configuration document.

     Changes the process working directory to `data_dir`, loads the
     XML catalog `catalog.xml` found there, then parses `jjigw.xml`
     from `config_dir` with validation switched on.

     After a successful load the instance exposes:

     - `doc`: the parsed document,
     - `connect`: a `ConnectConfig` built from the first
       ``jjigw/connect`` node,
     - `networks`: `NetworkConfig` objects keyed by `jid.domain`,
     - `jid`: the JID of the first network whose JID is truthy
       (`None` when there is none),
     - `spidentd`: a `SPIdentDConfig` for the first ``jjigw/spidentd``
       node, or `None` when the document has no such node,
     - `admins`: a list of `JID` objects, one per ``jjigw/admin`` node.

     Raises `JJIGWFatalError` when the parser reports the document as
     not valid.
     """
     self.doc = None
     self.config_dir = config_dir
     self.data_dir = data_dir
     os.chdir(data_dir)

     libxml2.initializeCatalog()
     libxml2.loadCatalog(os.path.join(data_dir, "catalog.xml"))

     config_path = os.path.join(config_dir, "jjigw.xml")
     parser = libxml2.createFileParserCtxt(config_path)
     parser.validate(1)
     parser.parseDocument()
     if not parser.isValid():
         # Call form of raise: accepted by both Python 2 and Python 3.
         raise JJIGWFatalError("Invalid configuration")
     self.doc = parser.doc()

     self.connect = ConnectConfig(self.doc.xpathEval("jjigw/connect")[0])

     self.jid = None
     self.networks = {}
     for node in self.doc.xpathEval("jjigw/network"):
         network = NetworkConfig(node)
         # The first network seen supplies the instance-wide JID.
         if not self.jid:
             self.jid = network.jid
         self.networks[network.jid.domain] = network

     spidentd = self.doc.xpathEval("jjigw/spidentd")
     if spidentd:
         self.spidentd = SPIdentDConfig(spidentd[0])
     else:
         self.spidentd = None

     self.admins = [JID(node.getContent())
                    for node in self.doc.xpathEval("jjigw/admin")]
Ejemplo n.º 3
0
 def load(self):
     """Parse the source file into ``self.xml`` and run XInclude on it.

     Each entry returned by ``self.get_catalogs_list()`` is handed to
     ``libxml2.loadCatalog`` before the file at ``self.filepath`` is
     parsed. The value returned by the XInclude step is not used.

     Raises DocumentError when ``self.filepath`` is None.
     """
     if self.filepath is None:
         raise DocumentError('The document source file path is not defined.')

     # Catalogs have to be registered before the document is parsed.
     for catalog_file in self.get_catalogs_list():
         libxml2.loadCatalog(catalog_file)

     self.xml = libxml2.parseFile(self.filepath)

     # Process the XInclude part of the freshly parsed document.
     self.xml.xincludeProcess()
Ejemplo n.º 4
0
def xsl_transform(content, bDownloadImages):
    """Run *content* through the two GDOCS2CNXML stylesheet passes.

    Steps, in order:

    1. ``tidy_and_premail(content)`` cleans up the input.
    2. libxml2 is configured globally: the ``XHTML_ENTITIES`` catalog is
       loaded, and line numbers and entity substitution are enabled.
    3. The ``GDOCS2CNXML_XSL1`` stylesheet is applied.
    4. The intermediate result is parsed with ``etree`` and passed to
       ``tex2mathml``.
    5. When *bDownloadImages* is true, ``downloadImages`` is called on
       the tree.
    6. The ``GDOCS2CNXML_XSL2`` stylesheet is applied to the result.

    Returns a ``(strResult2, imageObjects)`` tuple: the serialized output
    of the second pass, and the second value returned by
    ``downloadImages`` (an empty dict when no download was requested).
    """
    # 1
    strTidiedHtml = tidy_and_premail(content)

    # 2 Settings for libxml2 for transforming XHTML entities to valid XML
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 3 First XSLT transformation
    strResult1 = _apply_stylesheet(GDOCS2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for TeX2MathML and image download
    etreeXml = etree.fromstring(strResult1)

    # 4 Convert TeX to MathML with Blahtex
    etreeXml = tex2mathml(etreeXml)

    # 5 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 6 Second transformation
    strResult2 = _apply_stylesheet(GDOCS2CNXML_XSL2, strXml)

    return strResult2, imageObjects


def _apply_stylesheet(stylesheet_path, xml_string):
    """Apply the XSLT stylesheet at *stylesheet_path* to *xml_string*.

    Returns the serialized transformation result. The stylesheet, the
    parsed source document and the result document are released before
    returning, including when a later step raises.
    """
    style = doc = result = None
    try:
        style = libxslt.parseStylesheetDoc(libxml2.parseFile(stylesheet_path))
        doc = libxml2.parseDoc(xml_string)
        result = style.applyStylesheet(doc, None)
        return style.saveResultToString(result)
    finally:
        # These objects are freed by hand, so release whatever was
        # created: stylesheet first, then source document, then result.
        if style is not None:
            style.freeStylesheet()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()
def xsl_transform(content, bDownloadImages):
    """Run *content* through the two GDOCS2CNXML stylesheet passes.

    Steps, in order:

    1. ``tidy_and_premail(content)`` cleans up the input.
    2. libxml2 is configured globally: the ``XHTML_ENTITIES`` catalog is
       loaded, and line numbers and entity substitution are enabled.
    3. The ``GDOCS2CNXML_XSL1`` stylesheet is applied.
    4. The intermediate result is parsed with ``etree`` and passed to
       ``tex2mathml``.
    5. When *bDownloadImages* is true, ``downloadImages`` is called on
       the tree.
    6. The ``GDOCS2CNXML_XSL2`` stylesheet is applied to the result.

    Returns a ``(strResult2, imageObjects)`` tuple: the serialized output
    of the second pass, and the second value returned by
    ``downloadImages`` (an empty dict when no download was requested).
    """
    # 1
    strTidiedHtml = tidy_and_premail(content)

    # 2 Settings for libxml2 for transforming XHTML entities to valid XML
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 3 First XSLT transformation
    strResult1 = _apply_stylesheet(GDOCS2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for TeX2MathML and image download
    etreeXml = etree.fromstring(strResult1)

    # 4 Convert TeX to MathML with Blahtex
    etreeXml = tex2mathml(etreeXml)

    # 5 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 6 Second transformation
    strResult2 = _apply_stylesheet(GDOCS2CNXML_XSL2, strXml)

    return strResult2, imageObjects


def _apply_stylesheet(stylesheet_path, xml_string):
    """Apply the XSLT stylesheet at *stylesheet_path* to *xml_string*.

    Returns the serialized transformation result. The stylesheet, the
    parsed source document and the result document are released before
    returning, including when a later step raises.
    """
    style = doc = result = None
    try:
        style = libxslt.parseStylesheetDoc(libxml2.parseFile(stylesheet_path))
        doc = libxml2.parseDoc(xml_string)
        result = style.applyStylesheet(doc, None)
        return style.saveResultToString(result)
    finally:
        # These objects are freed by hand, so release whatever was
        # created: stylesheet first, then source document, then result.
        if style is not None:
            style.freeStylesheet()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()
Ejemplo n.º 6
0
def xsl_transform(content, bDownloadImages, base_or_source_url='.',
                  use_readability=True):
    """Run *content* through the two XHTML2CNXML stylesheet passes.

    Steps, in order:

    1. The title is taken from ``Document(content).title()``; if that
       raises, the title stays ``"Untitled"``.
    2. When *use_readability* is true (the default) the text to convert
       is ``Document(content).summary()``, otherwise *content* itself.
    3. ``tidy_and_premail`` cleans up that text.
    4. libxml2 is configured globally: the ``XHTML_ENTITIES`` catalog is
       loaded, and line numbers and entity substitution are enabled.
    5. The ``XHTML2CNXML_XSL1`` stylesheet is applied.
    6. When *bDownloadImages* is true, ``downloadImages`` is called with
       the tree and *base_or_source_url*.
    7. ``add_cnxml_title`` is called with the tree and the title.
    8. The ``XHTML2CNXML_XSL2`` stylesheet is applied to the result.

    Returns a ``(strResult2, imageObjects, html_title)`` tuple: the
    serialized output of the second pass, the second value returned by
    ``downloadImages`` (an empty dict when no download was requested),
    and the title.
    """
    # 1 get title with readability
    html_title = "Untitled"
    try:
        html_title = Document(content).title()
    except Exception:
        # Best effort: keep the placeholder title, but let
        # KeyboardInterrupt and SystemExit propagate.
        pass

    # 2 use readability to get content
    if use_readability:
        readable_article = Document(content).summary()
    else:
        readable_article = content

    # 3 tidy and premail
    strTidiedHtml = tidy_and_premail(readable_article)

    # 4 Load XHTML catalog files: Makes XHTML entities readable.
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 5 XSLT transformation
    strResult1 = _apply_stylesheet(XHTML2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for image download
    etreeXml = etree.fromstring(strResult1)

    # 6 TeX to MathML conversion with Blahtex is not done for XHTML.

    # 7 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml, base_or_source_url)

    # 8 add title from html
    etreeXml = add_cnxml_title(etreeXml, html_title)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 9 Second transformation
    strResult2 = _apply_stylesheet(XHTML2CNXML_XSL2, strXml)

    return strResult2, imageObjects, html_title


def _apply_stylesheet(stylesheet_path, xml_string):
    """Apply the XSLT stylesheet at *stylesheet_path* to *xml_string*.

    Returns the serialized transformation result. The stylesheet, the
    parsed source document and the result document are released before
    returning, including when a later step raises.
    """
    style = doc = result = None
    try:
        style = libxslt.parseStylesheetDoc(libxml2.parseFile(stylesheet_path))
        doc = libxml2.parseDoc(xml_string)
        result = style.applyStylesheet(doc, None)
        return style.saveResultToString(result)
    finally:
        # These objects are freed by hand, so release whatever was
        # created: stylesheet first, then source document, then result.
        if style is not None:
            style.freeStylesheet()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()
Ejemplo n.º 7
0
def init_libxml2(xml):
    """Configure libxml2's global defaults and pass *xml* through.

    Loads the ``XHTML_ENTITIES`` catalog, then enables the line-number
    and entity-substitution defaults.

    Returns ``(xml, {})``: the argument unchanged, paired with a new
    empty dict.
    """
    enabled = 1
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(enabled)
    libxml2.substituteEntitiesDefault(enabled)

    empty_mapping = {}
    return xml, empty_mapping
Ejemplo n.º 8
0
def xsl_transform(content, bDownloadImages, base_or_source_url='.'):
    """Run *content* through the two XHTML2CNXML stylesheet passes.

    Steps, in order:

    1. ``tidy_and_premail(content)`` cleans up the input.
    2. libxml2 is configured globally: the ``XHTML_ENTITIES`` catalog is
       loaded, and line numbers and entity substitution are enabled.
    3. The ``XHTML2CNXML_XSL1`` stylesheet is applied.
    4. When *bDownloadImages* is true, ``downloadImages`` is called with
       the tree and *base_or_source_url*.
    5. ``add_cnxml_title`` is called with the tree and the title.
    6. The ``XHTML2CNXML_XSL2`` stylesheet is applied to the result.

    Returns a ``(strResult2, imageObjects, html_title)`` tuple: the
    serialized output of the second pass, the second value returned by
    ``downloadImages`` (an empty dict when no download was requested),
    and the title, which is always ``"Untitled"`` here.
    """
    # Readability-based title extraction and content summarising are
    # deliberately not used: they only make sense for an unknown HTML
    # (see https://github.com/Connexions/rhaptos.html2cnxml), so the
    # title stays at its placeholder and the content is used as given.
    html_title = "Untitled"
    readable_article = content

    # 3 tidy and premail
    strTidiedHtml = tidy_and_premail(readable_article)

    # 4 Load XHTML catalog files: Makes XHTML entities readable.
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(1)
    libxml2.substituteEntitiesDefault(1)

    # 5 XSLT transformation
    strResult1 = _apply_stylesheet(XHTML2CNXML_XSL1, strTidiedHtml)

    # Parse XML with etree from lxml for image download
    etreeXml = etree.fromstring(strResult1)

    # 6 TeX to MathML conversion with Blahtex is not done for XHTML.

    # 7 Optional: Download Google Docs Images
    imageObjects = {}
    if bDownloadImages:
        etreeXml, imageObjects = downloadImages(etreeXml, base_or_source_url)

    # 8 add title from html
    etreeXml = add_cnxml_title(etreeXml, html_title)

    # Convert etree back to string
    strXml = etree.tostring(etreeXml)

    # 9 Second transformation
    strResult2 = _apply_stylesheet(XHTML2CNXML_XSL2, strXml)

    return strResult2, imageObjects, html_title


def _apply_stylesheet(stylesheet_path, xml_string):
    """Apply the XSLT stylesheet at *stylesheet_path* to *xml_string*.

    Returns the serialized transformation result. The stylesheet, the
    parsed source document and the result document are released before
    returning, including when a later step raises.
    """
    style = doc = result = None
    try:
        style = libxslt.parseStylesheetDoc(libxml2.parseFile(stylesheet_path))
        doc = libxml2.parseDoc(xml_string)
        result = style.applyStylesheet(doc, None)
        return style.saveResultToString(result)
    finally:
        # These objects are freed by hand, so release whatever was
        # created: stylesheet first, then source document, then result.
        if style is not None:
            style.freeStylesheet()
        if doc is not None:
            doc.freeDoc()
        if result is not None:
            result.freeDoc()
def init_libxml2(xml):
    """Configure libxml2's global defaults and pass *xml* through.

    Loads the ``XHTML_ENTITIES`` catalog, then enables the line-number
    and entity-substitution defaults.

    Returns ``(xml, {})``: the argument unchanged, paired with a new
    empty dict.
    """
    enabled = 1
    libxml2.loadCatalog(XHTML_ENTITIES)
    libxml2.lineNumbersDefault(enabled)
    libxml2.substituteEntitiesDefault(enabled)

    empty_mapping = {}
    return xml, empty_mapping