Beispiel #1
0
 def is_cache_full(self):
     """
     Check if cache has more than one document
     """
     if os.path.exists(self.cache_file):
         reader = SAXReader()
         cache_doc = reader.read(
             File(self.cache_file)            
         )
         pinfo = ParliamentInfoParams()
         list_of_cached_nodes = cache_doc.selectNodes(
             pinfo._xpath_content_types()
         )
         if not self.bicameral:
             """
             If its unicameral
             """
             if len(list_of_cached_nodes) == 1:
                 return True
             else:
                 return False
         else:
             """
             If its bicameral
             """
             if len(list_of_cached_nodes) == 2:
                 return True
             else:
                 return False
     return False 
Beispiel #2
0
 def append_to_cache(self, input_file):
     reader = SAXReader()
     new_doc = reader.read(
         File(input_file)
     )
     element_to_import = new_doc.getRootElement()
     self.append_element_into_cache_document(element_to_import)
Beispiel #3
0
 def getPayloadContent(self):
     format = self.__metadata.getField("dc_format")
     slash = self.__oid.rfind("/")
     pid = self.__oid[slash+1:]
     print " *** payload content, format: %s, pid: %s *** " % (format, pid)
     contentStr = ""
     if format.startswith("text"):
         contentStr = "<pre>"
         payload = self.__storage.getPayload(self.__oid, pid)
         str = StringWriter() 
         IOUtils.copy(payload.getInputStream(), str)
         contentStr += str.toString()
         contentStr += "</pre>"
     elif format.find("vnd.ms-")>-1 or format.find("vnd.oasis.opendocument.")>-1:
         #get the html version if exist....
         pid = pid[:pid.find(".")] + ".htm"
         payload = self.__storage.getPayload(self.__oid, pid)
         saxReader = SAXReader()
         document = saxReader.read(payload.getInputStream())
         slideNode = document.selectSingleNode("//div[@class='body']")
         #linkNodes = slideNode.selectNodes("//img")
         #contentStr = slideNode.asXML();
         # encode character entities correctly
         out = ByteArrayOutputStream()
         format = OutputFormat.createPrettyPrint()
         format.setSuppressDeclaration(True)
         writer = XMLWriter(out, format)
         writer.write(slideNode)
         writer.close()
         contentStr = out.toString("UTF-8")
     return contentStr
Beispiel #4
0
 def getPayloadContent(self):
     mimeType = self.__mimeType
     print " * single.py: payload content mimeType=%s" % mimeType
     contentStr = ""
     if mimeType.startswith("text/"):
         if mimeType == "text/html":
             contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % (
                 contextPath,
                 portalId,
                 self.__oid,
             )
         else:
             pid = self.__oid[self.__oid.rfind("/") + 1 :]
             payload = self.__storage.getPayload(self.__oid, pid)
             print " * single.py: pid=%s payload=%s" % (pid, payload)
             if payload is not None:
                 sw = StringWriter()
                 sw.write("<pre>")
                 IOUtils.copy(payload.getInputStream(), sw)
                 sw.write("</pre>")
                 sw.flush()
                 contentStr = sw.toString()
     elif (
         mimeType == "application/pdf"
         or mimeType.find("vnd.ms") > -1
         or mimeType.find("vnd.oasis.opendocument.") > -1
     ):
         # get the html version if exist...
         pid = os.path.splitext(self.__pid)[0] + ".htm"
         print " * single.py: pid=%s" % pid
         # contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \
         #    (contextPath, portalId, self.__oid, pid)
         payload = self.__storage.getPayload(self.__oid, pid)
         saxReader = SAXReader(Boolean.parseBoolean("false"))
         try:
             document = saxReader.read(payload.getInputStream())
             slideNode = document.selectSingleNode("//*[local-name()='body']")
             # linkNodes = slideNode.selectNodes("//img")
             # contentStr = slideNode.asXML();
             # encode character entities correctly
             slideNode.setName("div")
             out = ByteArrayOutputStream()
             format = OutputFormat.createPrettyPrint()
             format.setSuppressDeclaration(True)
             format.setExpandEmptyElements(True)
             writer = XMLWriter(out, format)
             writer.write(slideNode)
             writer.close()
             contentStr = out.toString("UTF-8")
         except:
             traceback.print_exc()
             contentStr = '<p class="error">No preview available</p>'
     elif mimeType.startswith("image/"):
         src = "%s/%s" % (self.__oid, self.__pid)
         contentStr = (
             '<a class="image" href="%(src)s"  style="max-width:98%%">'
             '<img src="%(src)s" style="max-width:100%%" /></a>' % {"src": self.__pid}
         )
     return contentStr
Beispiel #5
0
 def getPayloadContent(self):
     mimeType = self.__mimeType
     print " * detail.py: payload content mimeType=%s" % mimeType
     contentStr = ""
     if mimeType == "application/octet-stream":
         dcFormat = self.__json.get("response/docs/dc_format")
         if dcFormat is not None:
             dcFormat = dcFormat[1:-1]
         print dcFormat, mimeType
         if dcFormat != mimeType:
             return "<div><em>(File not found)</em></div>"
         else:
             return "<div><em>(Binary file)</em></div>"
     elif mimeType.startswith("text/"):
         if mimeType == "text/html":
             contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % \
                 (contextPath, portalId, self.__oid)
         else:
             pid = self.__oid[self.__oid.rfind("/")+1:]
             payload = self.__storage.getPayload(self.__oid, pid)
             #print " * detail.py: pid=%s payload=%s" % (pid, payload)
             if payload is not None:
                 sw = StringWriter()
                 sw.write("<pre>")
                 IOUtils.copy(payload.getInputStream(), sw)
                 sw.write("</pre>")
                 sw.flush()
                 contentStr = sw.toString()
     elif mimeType == "application/pdf" or mimeType.find("vnd.ms")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1:
         # get the html version if exist...
         pid = os.path.splitext(self.__pid)[0] + ".htm"
         print " * detail.py: pid=%s" % pid
         #contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \
         #    (contextPath, portalId, self.__oid, pid)
         payload = self.__storage.getPayload(self.__oid, pid)
         saxReader = SAXReader(Boolean.parseBoolean("false"))
         try:
             document = saxReader.read(payload.getInputStream())
             slideNode = document.selectSingleNode("//*[local-name()='body']")
             #linkNodes = slideNode.selectNodes("//img")
             #contentStr = slideNode.asXML();
             # encode character entities correctly
             slideNode.setName("div")
             out = ByteArrayOutputStream()
             format = OutputFormat.createPrettyPrint()
             format.setSuppressDeclaration(True)
             format.setExpandEmptyElements(True)
             writer = XMLWriter(out, format)
             writer.write(slideNode)
             writer.close()
             contentStr = out.toString("UTF-8")
         except:
             traceback.print_exc()
             contentStr = "<p class=\"error\">No preview available</p>"
     return contentStr
Beispiel #6
0
def loadConfig():
    """
    Load the configuration file
    return Document
    """
    document = None
    reader = SAXReader()
    try:
        document = reader.read(CONF_FILE)
    except DocumentException, detail:
        print "Error: %s" % detail.getMessage()
Beispiel #7
0
def test(docname):
    """Test the functions in this module.
    """
    reader = SAXReader()
    doc = reader.read(docname)
    show_tree(doc, 'Original tree:')
    date = time.ctime()
    modify_tree(doc, 'person', 'date', date)
    show_tree(doc, 'After walk and modify:')
    modify_tree_xpath(doc, '//people/person/name', 'date', date)
    show_tree(doc, 'After XPath modify:')
Beispiel #8
0
 def append_to_cache(self, input_file, search_node):
     reader = SAXReader()
     new_doc = reader.read(
         File(input_file)
     )
     found_node = new_doc.selectSingleNode(search_node)
     found_id = found_node.getText()
     if not self.does_item_exist_in_cache(found_id):
         element_to_import = new_doc.getRootElement()
         self.append_element_into_cache_document(element_to_import)
     else:
         LOG.info(found_id + " already exists in cache")        
Beispiel #9
0
 def append_to_cache(self, input_file):
     reader = SAXReader()
     new_doc = reader.read(
         File(input_file)
     )
     parl_node = new_doc.selectSingleNode("/contenttype[@name='parliament']/field[@name='parliament_id']")
     parliament_id = parl_node.getText()
     if not self.does_parliament_exists_in_cache(parliament_id):
         element_to_import = new_doc.getRootElement()
         self.append_element_into_cache_document(element_to_import)
     else:
         LOG.info(parliament_id + " already exists in cache")
 def getPayloadContent(self):
     mimeType = self.__mimeType
     print " * single.py: payload content mimeType=%s" % mimeType
     contentStr = ""
     if mimeType.startswith("text/"):
         if mimeType == "text/html":
             contentStr = '<iframe class="iframe-preview" src="%s/download/%s"></iframe>' % \
                 (portalPath, self.__oid)
         else:
             pid = self.__oid[self.__oid.rfind("/") + 1:]
             payload = self.__storage.getPayload(self.__oid, pid)
             print " * single.py: pid=%s payload=%s" % (pid, payload)
             if payload is not None:
                 sw = StringWriter()
                 sw.write("<pre>")
                 IOUtils.copy(payload.getInputStream(), sw)
                 sw.write("</pre>")
                 sw.flush()
                 contentStr = sw.toString()
     elif mimeType == "application/pdf" or mimeType.find(
             "vnd.ms") > -1 or mimeType.find(
                 "vnd.oasis.opendocument.") > -1:
         # get the html version if exist...
         pid = os.path.splitext(self.__pid)[0] + ".htm"
         print " * single.py: pid=%s" % pid
         #contentStr = '<iframe class="iframe-preview" src="%s/download/%s/%s"></iframe>' % \
         #    (portalPath, self.__oid, pid)
         payload = self.__storage.getPayload(self.__oid, pid)
         saxReader = SAXReader(Boolean.parseBoolean("false"))
         try:
             document = saxReader.read(payload.getInputStream())
             slideNode = document.selectSingleNode(
                 "//*[local-name()='body']")
             #linkNodes = slideNode.selectNodes("//img")
             #contentStr = slideNode.asXML();
             # encode character entities correctly
             slideNode.setName("div")
             out = ByteArrayOutputStream()
             format = OutputFormat.createPrettyPrint()
             format.setSuppressDeclaration(True)
             format.setExpandEmptyElements(True)
             writer = XMLWriter(out, format)
             writer.write(slideNode)
             writer.close()
             contentStr = out.toString("UTF-8")
         except:
             traceback.print_exc()
             contentStr = "<p class=\"error\">No preview available</p>"
     elif mimeType.startswith("image/"):
         src = "%s/%s" % (self.__oid, self.__pid)
         contentStr = '<a class="image" href="%(src)s"  style="max-width:98%%">' \
             '<img src="%(src)s" style="max-width:100%%" /></a>' % { "src": self.__pid }
     return contentStr
Beispiel #11
0
 def new_cache(self, input_file):
     """
     Takes the input file, creates a new empty cache document, 
     and adds the input file to the cache 
     """
     self.new_cache_document()
     reader = SAXReader()
     new_doc = reader.read(
         File(input_file)
     )
     element_to_import = new_doc.getRootElement()
     self.append_element_into_cache_document(element_to_import)
Beispiel #12
0
    def __activate__(self, context):
        self.log = context["log"]
        self.config = context["systemConfig"]
        response = context["response"]

        try:
            ## Variable prep
            defaultPath = FascinatorHome.getPath("alerts")
            self.alertsPath = self.config.getString(defaultPath,
                                                    ["alerts", "path"])
            self.configFile = None  # We'll allocate this later... if needed
            self.redboxVersion = self.config.getString(
                "", "redbox.version.string")
            self.csvDialect = csv.excel
            self.csvDialect.skipinitialspace = True

            ## XML Parsing
            docFactory = DocumentFactory()
            ##docFactory.setXPathNamespaceURIs(namespaces)
            self.saxReader = SAXReader(docFactory)

            ## Do our job
            (success, failed) = self.__processDir()

            ## Send response to the client (if debugging in browser)
            writer = response.getPrintWriter("text/plain; charset=UTF-8")
            writer.println("%s successful, %s failed" % (success, failed))
            writer.close()

        except Exception, e:
            response.setStatus(500)
            writer = response.getPrintWriter("text/plain; charset=UTF-8")
            writer.println("Unexpected error during script execution:\n%s" %
                           str(e))
            writer.close()
Beispiel #13
0
 def is_cache_full(self):
     """
     Check if cache has more than one document
     """
     if os.path.exists(self.cache_file):
         reader = SAXReader()
         cache_doc = reader.read(
             File(self.cache_file)            
         )
         #pinfo = ParliamentInfoParams()
         list_of_cached_nodes = cache_doc.selectNodes(
             self.axis_to_check_cache_full()
         )
         if len(list_of_cached_nodes) == self.chambers_required:
             return True
         else:
             return False
     return False 
Beispiel #14
0
    def __init__(self, file, config, baseline):
        AlertHandler.__init__(self, file, config, baseline)
        docFactory = DocumentFactory()
        self.saxReader = SAXReader(docFactory)
        self.xmlMapFile = StrSubstitutor.replaceSystemProperties(
            config['xmlMap'])
        if not os.path.exists(self.xmlMapFile):
            raise AlertException("Requested xmlMap file %s does not exist." %
                                 self.xmlMapFile)

        ## Make sure we can see our mappings
        inStream = FileInputStream(File(self.xmlMapFile))
        xmlMappings = JsonSimple(inStream)
        self.map = xmlMappings.getObject(["mappings"])
        self.exceptions = xmlMappings.getObject(["exceptions"])
        self.defaultNamespace = xmlMappings.getObject(["defaultNamespace"])

        self.mappedExceptionCount = 0
Beispiel #15
0
 def __getPayloadContent(self, oid, pid):
     print " * combined.py: oid='%s' pid='%s'" % (oid, pid)
     payload = self.__storage.getPayload(oid, pid)
     if payload is None:
         return "<div>Error: No content for '%s'</div>" % oid
     mimeType = payload.contentType
     contentStr = ""
     if mimeType.startswith("text/"):
         if mimeType == "text/html":
             contentStr = '<iframe class="iframe-preview" src="%s/download/%s"></iframe>' % \
                 (portalPath, oid)
         else:
             sw = StringWriter()
             sw.write("<pre>")
             IOUtils.copy(payload.getInputStream(), sw)
             sw.write("</pre>")
             sw.flush()
             contentStr = sw.toString()
     elif mimeType == "application/pdf" or mimeType.find("vnd.ms")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1:
         # get the html version if exist...
         pid = os.path.splitext(pid)[0] + ".htm"
         print " * combined.py: pid=%s" % pid
         payload = self.__storage.getPayload(oid, pid)
         saxReader = SAXReader(False)
         try:
             document = saxReader.read(payload.getInputStream())
             slideNode = document.selectSingleNode("//*[local-name()='body']")
             slideNode.setName("div")
             out = ByteArrayOutputStream()
             format = OutputFormat.createPrettyPrint()
             format.setSuppressDeclaration(True)
             format.setExpandEmptyElements(True)
             writer = XMLWriter(out, format)
             writer.write(slideNode)
             writer.close()
             contentStr = out.toString("UTF-8")
         except:
             traceback.print_exc()
             contentStr = "<p class=\"error\">No preview available</p>"
     elif mimeType.startswith("image/"):
         src = "%s/%s" % (oid, pid)
         contentStr = '<a class="image" href="%(src)s"  style="max-width:98%%">' \
             '<img src="%(src)s" style="max-width:100%%" /></a>' % { "src": pid }
     return contentStr
Beispiel #16
0
 def getPayloadContent(self):
     mimeType = self.__mimeType
     print " * detail.py: payload content mimeType=%s" % mimeType
     contentStr = ""
     if mimeType.startswith("text/"):
         if mimeType == "text/html":
             contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % \
                 (contextPath, portalId, self.__oid)
         else:
             pid = self.__oid[self.__oid.rfind("/")+1:]
             payload = self.__storage.getPayload(self.__oid, pid)
             print " * detail.py: pid=%s payload=%s" % (pid, payload)
             if payload is not None:
                 sw = StringWriter()
                 sw.write("<pre>")
                 IOUtils.copy(payload.getInputStream(), sw)
                 sw.write("</pre>")
                 sw.flush()
                 contentStr = sw.toString()
     elif mimeType == "application/pdf" or mimeType.find("vnd")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1:
         # get the html version if exist...
         pid = os.path.splitext(self.__pid)[0] + ".htm"
         print " * detail.py: pid=%s" % pid
         #contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \
         #    (contextPath, portalId, self.__oid, pid)
         payload = self.__storage.getPayload(self.__oid, pid)
         saxReader = SAXReader(Boolean.parseBoolean("false"))
         try:
             document = saxReader.read(payload.getInputStream())
         except:
             traceback.print_exc()
         #slideNode = document.selectSingleNode("//div[@class='body']")
         slideNode = document.selectSingleNode("//*[local-name()='body']")
         #linkNodes = slideNode.selectNodes("//img")
         #contentStr = slideNode.asXML();
         # encode character entities correctly
         out = ByteArrayOutputStream()
         format = OutputFormat.createPrettyPrint()
         format.setSuppressDeclaration(True)
         writer = XMLWriter(out, format)
         writer.write(slideNode)
         writer.close()
         contentStr = out.toString("UTF-8")
     return contentStr
Beispiel #17
0
 def __init__(self, file, config, baseline):
     AlertHandler.__init__(self, file, config, baseline)
     docFactory = DocumentFactory()
     self.saxReader = SAXReader(docFactory)
     self.xmlMapFile = StrSubstitutor.replaceSystemProperties(config['xmlMap'])
     if not os.path.exists(self.xmlMapFile):
         raise AlertException("Requested xmlMap file %s does not exist." % self.xmlMapFile)
     
     ## Make sure we can see our mappings
     inStream = FileInputStream(File(self.xmlMapFile))
     xmlMappings = JsonSimple(inStream)
     self.map = xmlMappings.getObject(["mappings"])
     self.exceptions = xmlMappings.getObject(["exceptions"])
     self.defaultNamespace = xmlMappings.getObject(["defaultNamespace"])
     
     self.mappedExceptionCount = 0
Beispiel #18
0
def test(indocname):
    reader = SAXReader()
    doc = reader.read(indocname)
    visitor = Visitor()
    doc.accept(visitor)
Beispiel #19
0
    def __createEpub(self):
        title = self.__manifest.get("title")
        
        response.setHeader("Content-Disposition", "attachment; filename=%s.epub" %  urllib.quote(title))
        out = response.getOutputStream("application/epub+zip")
        zipOutputStream = ZipOutputStream(out)
        
        #save mimetype... and the rest of standard files in epub
        zipOutputStream.putNextEntry(ZipEntry("mimetype"))
        epubMimetypeStream = self.__getResourceAsStream("/epub/mimetype")
        IOUtils.copy(epubMimetypeStream, zipOutputStream)
        zipOutputStream.closeEntry()
        
        zipOutputStream.putNextEntry(ZipEntry("META-INF/container.xml"))
        epubContainerStream = self.__getResourceAsStream("/epub/container.xml")
        IOUtils.copy(epubContainerStream, zipOutputStream)
        zipOutputStream.closeEntry()

        zipOutputStream.putNextEntry(ZipEntry("OEBPS/epub.css"))
        epubcss = self.__getResourceAsStream("/epub/epub.css")
        IOUtils.copy(epubcss, zipOutputStream)
        zipOutputStream.closeEntry()
        
        #### Creating toc.ncx ####
        tocXml = ElementTree.Element("ncx", {"version": "2005-1", "xml:lang":"en", "xmlns":"http://www.daisy.org/z3986/2005/ncx/"})
        headNode = ElementTree.Element("head")
        tocXml.append(headNode)
        
        headNode.append(ElementTree.Element("meta", {"name": "dtb:uid", "content": "1"}))
        headNode.append(ElementTree.Element("meta", {"name": "dtb:depth", "content": "1"}))
        headNode.append(ElementTree.Element("meta", {"name": "dtb:totalPageCount", "content": "1"}))
        headNode.append(ElementTree.Element("meta", {"name": "dtb:maxPageNumber", "content": "1"}))
        headNode.append(ElementTree.Element("meta", {"name": "dtb:generator", "content": "ICE v2"}))
        
        #docTitle
        docTitle = ElementTree.Element("docTitle")
        textNode = ElementTree.Element("text")
        textNode.text = title
        docTitle.append(textNode)
        tocXml.append(docTitle)
        
        #docAuthor
        docAuthor = ElementTree.Element("docAuthor")
        textNode = ElementTree.Element("text")
        textNode.text = "ICE v2"
        docAuthor.append(textNode)
        tocXml.append(docAuthor)
        
        #navMap
        navMap = ElementTree.Element("navMap")
        tocXml.append(navMap)
        
        #### Creating content.opf ####
        contentXml = ElementTree.Element("package", {"version": "2.0", "xmlns":"http://www.idpf.org/2007/opf",
                                                     "unique-identifier":"BookId"})
        
        metadataNode = ElementTree.Element("metadata", {"xmlns:dc": "http://purl.org/dc/elements/1.1/", 
                                                        "xmlns:opf": "http://www.idpf.org/2007/opf"})
        contentXml.append(metadataNode)
        
        #metadata information
        metadata = ElementTree.Element("dc:title")
        metadata.text = title
        metadataNode.append(metadata)
        
        metadata = ElementTree.Element("dc:language")
        metadata.text = "en-AU"
        metadataNode.append(metadata)
        
        metadata = ElementTree.Element("dc:creator", {"opf:role":"aut"})
        metadata.text = "ICE"
        metadataNode.append(metadata)
        
        metadata = ElementTree.Element("dc:publisher")
        metadata.text = "University of Southern Queensland"
        metadataNode.append(metadata)
        
        metadata = ElementTree.Element("dc:identifier", {"id":"BookId"})
        metadata.text = title
        metadataNode.append(metadata)
        
        #manifest
        manifest = ElementTree.Element("manifest")
        contentXml.append(manifest)
        
        spine = ElementTree.Element("spine", {"toc":"ncx"})
        contentXml.append(spine)
        
        item = ElementTree.Element("item", {"id":"ncx", "href":"toc.ncx", "media-type":"text/xml"})
        manifest.append(item)
        css = ElementTree.Element("item", {"id":"style", "href":"epub.css", "media-type":"text/css"})
        manifest.append(css)
        
        count = 1
        for itemHash in self.__orderedItem:
            id, title, htmlFileName, payloadDict, isImage = self.__itemRefDict[itemHash]
            
            for payloadId in payloadDict:
                payload, payloadType = payloadDict[payloadId]
                if isinstance(payload, Payload):
                    payloadId = payloadId.lower()
                    zipEntryId = payloadId.replace(" ", "_").replace("\\", "/")
                    if payloadType == "application/xhtml+xml":
                        zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                        ##process the html....
                        saxReader = SAXReader(False)
                        try:
                            saxDoc = saxReader.read(payload.open())
                            payload.close()
#                            ## remove class or style nodes
#                            classOrStyleNodes = saxDoc.selectNodes("//@class | //@style ")
#                            for classOrStyleNode in classOrStyleNodes:
#                                node = classOrStyleNode
#                                if classOrStyleNode.getParent():
#                                    node = classOrStyleNode.getParent()
#                                if node.getQualifiedName() == "img":
#                                    attr = node.attribute(QName("class"))
#                                    attr = node.attribute(QName("class"))
#                                    if attr:
#                                        node.remove(attr)
#                                    attr = node.attribute(QName("style"))
#                                    if attr:
#                                        node.remove(attr)
                                    
                            ## remove name in a tags
                            ahrefs = saxDoc.selectNodes("//*[local-name()='a' and @name!='']")
                            for a in ahrefs:
                                attr = a.attribute(QName("name"))
                                if attr:
                                    a.remove(attr)
                                    
                            ## fix images src name.... replace space with underscore and all lower case
                            imgs = saxDoc.selectNodes("//*[local-name()='img' and contains(@src, '_files')]")
                            for img in imgs:
                                srcAttr = img.attribute(QName("src"))
                                if srcAttr:
                                    src = srcAttr.getValue()
                                    #hash the sourcename
                                    filepath, filename = os.path.split(src)
                                    filename, ext = os.path.splitext(filename)
                                    filename = hashlib.md5(filename).hexdigest()
                                    src = os.path.join(filepath.lower().replace(" ", "_"), "node-%s%s" % (filename, ext))
                                    img.addAttribute(QName("src"), src.replace(" ", "_"))
                                
                            bodyNode = saxDoc.selectSingleNode("//*[local-name()='div' and @class='body']")
                            bodyNode.setName("div")
                            
                            out = ByteArrayOutputStream()
                            format = OutputFormat.createPrettyPrint()
                            format.setSuppressDeclaration(True)
                            writer = XMLWriter(out, format)
                            writer.write(bodyNode)
                            writer.flush()
                            contentStr = out.toString("UTF-8")
                            
                            htmlString = """<?xml version="1.0" encoding="UTF-8"?>
                                            <html xmlns="http://www.w3.org/1999/xhtml"><head><title>%s</title>
                                            <link rel="stylesheet" href="epub.css"/>
                                            </head><body>%s</body></html>"""
                            htmlString = htmlString % (title, contentStr)
                            self.__copyString(htmlString, zipOutputStream)
                            includeFile = False
                        except:
                            traceback.print_exc()
                    else:
                        #images....
                        zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                        IOUtils.copy(payload.open(), zipOutputStream)
                        payload.close()
                        zipOutputStream.closeEntry()
                else:
                    zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                    IOUtils.copy(payload, zipOutputStream)
                    zipOutputStream.closeEntry()
                
                itemNode = ElementTree.Element("item", {"media-type":payloadType, "href": zipEntryId})  
                if payloadId == htmlFileName.lower():
                    itemNode.set("id", itemHash)
                else:
                    itemNode.set("id", payloadId.replace("/", "_"))                  
                manifest.append(itemNode)
                
            if not isImage: 
                navPoint = ElementTree.Element("navPoint", {"class":"chapter", "id":"%s" % itemHash, 
                                                                "playOrder":"%s" % count})
            else:
                navPoint = ElementTree.Element("navPoint", {"class":"chapter", "id":"%s" % htmlFileName, 
                                                                "playOrder":"%s" % count})
            navMap.append(navPoint)
            navLabel = ElementTree.Element("navLabel")
            navPoint.append(navLabel)
            textNode = ElementTree.Element("text")
            textNode.text = title
            navLabel.append(textNode)
            content = ElementTree.Element("content")
            navPoint.append(content)
            content.set("src", htmlFileName)
            count +=1
                
            itemRefNode = ElementTree.Element("itemref")
            spine.append(itemRefNode)
            itemRefNode.set("idref", itemHash)
                
        #saving content.opf...
        zipOutputStream.putNextEntry(ZipEntry("OEBPS/content.opf"))
        self.__copyString(ElementTree.tostring(contentXml), zipOutputStream)
        zipOutputStream.closeEntry()
        
        #saving toc.ncx
        zipOutputStream.putNextEntry(ZipEntry("OEBPS/toc.ncx"))
        self.__copyString(ElementTree.tostring(tocXml), zipOutputStream)
        zipOutputStream.closeEntry()
        
        zipOutputStream.close()
Beispiel #20
0
 def __load_xml__(self):
     xml_file = File(self.xmlfile)
     sr = SAXReader()
     self.xmldoc =  sr.read(xml_file)
Beispiel #21
0
class XMLAlertHandler(AlertHandler):
    ''''Processing class for a single XML File.
    Each XML file is expected to contain only a single Collection
    '''

    def __init__(self, file, config, baseline):
        AlertHandler.__init__(self, file, config, baseline)
        docFactory = DocumentFactory()
        self.saxReader = SAXReader(docFactory)
        self.xmlMapFile = StrSubstitutor.replaceSystemProperties(config['xmlMap'])
        if not os.path.exists(self.xmlMapFile):
            raise AlertException("Requested xmlMap file %s does not exist." % self.xmlMapFile)
        
        ## Make sure we can see our mappings
        inStream = FileInputStream(File(self.xmlMapFile))
        xmlMappings = JsonSimple(inStream)
        self.map = xmlMappings.getObject(["mappings"])
        self.exceptions = xmlMappings.getObject(["exceptions"])
        self.defaultNamespace = xmlMappings.getObject(["defaultNamespace"])
        
        self.mappedExceptionCount = 0
        
        
    def process(self):
        '''Read the XML file and map xpath items to metadata
        Return a list with 1 JsonSimple object (at most)
        '''
        jsonList = []
        data = None
        reader = None
        inStream = None
        document = None
        
        # Run the XML through our parser
        try:
            inStream = FileInputStream(File(self.file))
            reader = InputStreamReader(inStream, "UTF-8")
            document = self.saxReader.read(reader)
        # Parse fails
        except:
            raise
        # Close our file access objects
        finally:
            if reader is not None:
                reader.close()
            if inStream is not None:
                inStream.close()

        # Now go looking for all our data
        data = self.getNewJsonObject()
        self.__mapXpathToFields(document, self.map, data)
        
        if data is None:
            return None
        
        jsonList.append(JsonSimple(data))
        return jsonList


    ## Used recursively
    def __mapXpathToFields(self, sourceData, map, responseData, index = 1):
        for xpath in map.keySet():
            field = map.get(xpath)
            if xpath != "":
                xpathobj = DefaultXPath(xpath)
                if not self.defaultNamespace is None:
                    xpathobj.setNamespaceContext(SimpleNamespaceContext(self.defaultNamespace))
                    
                nodes = xpathobj.selectNodes(sourceData)
                
                if isinstance(field, JsonObject):
                    #The XPath key provides a dictionary containing sub xpath queries mapped to fields
                    i = 1
                    for node in nodes:
                        self.__mapXpathToFields(node, field, responseData, i)
                        i += 1
                else:
                    # Lists indicate we're copying the several fields
                    if isinstance(field, JSONArray):
                        for eachField in field:
                            self.__insertFieldData(nodes, eachField, responseData, index)
                    # or just one field
                    else:
                        self.__insertFieldData(nodes, field, responseData, index)

    def __insertFieldData(self, xmlNodes, field, responseData, index):
        multiValue = False
        multiIndex = 1
        fieldString = ""
        
        if self.exceptions["fields"].containsKey(field):
            #The field is an exception
            excepted = True
            output = self.exceptions["output"]
            self.mappedExceptionCount += 1
        else:
            # Nope, just normal
            excepted = False
            if ('.0.' in field and len(xmlNodes) > 1): 
            #In ReDBox, a field such as dc:subject.vivo:keyword.0.rdf:PlainLiteral indicates a list of values, using the number as a counter.
            #In the code below, if a field contains this number element, we can increment the counter and add more and more. 
            #If there is no number, we just overwrite the value.
                multiValue = True
                #we'll do the fieldString index change a little later
                fieldString = field
            else:
                fieldString = field.replace(".0.", ".%s."%index, 1)
            
        for node in xmlNodes:
            text = node.getTextTrim()
                
            if fieldString != "" and text != "":
                if excepted:
                    exceptionString = "%s: '%s' (%s)" % (exceptions["fields"][field], text, field)
                    responseData.put(fieldString, exceptionString)
                else:
                    if multiValue:
                        fieldString = field.replace(".0.", ".%s."%multiIndex, 1)
                        multiIndex += 1
                    responseData.put(fieldString, text)
Beispiel #22
0
 def __doc_cache_file(self):
     reader = SAXReader()
     cache_doc = reader.read(
         File(self.cache_file)
         )
     return cache_doc
    def __createEpub(self):
        title = self.__manifest.getString(None, "title")

        self.vc("response").setHeader(
            "Content-Disposition",
            "attachment; filename=%s.epub" % urllib.quote(title))
        out = self.vc("response").getOutputStream("application/epub+zip")
        zipOutputStream = ZipOutputStream(out)

        #save mimetype... and the rest of standard files in epub
        zipOutputStream.putNextEntry(ZipEntry("mimetype"))
        epubMimetypeStream = self.__getResourceAsStream("/epub/mimetype")
        IOUtils.copy(epubMimetypeStream, zipOutputStream)
        zipOutputStream.closeEntry()

        zipOutputStream.putNextEntry(ZipEntry("META-INF/container.xml"))
        epubContainerStream = self.__getResourceAsStream("/epub/container.xml")
        IOUtils.copy(epubContainerStream, zipOutputStream)
        zipOutputStream.closeEntry()

        zipOutputStream.putNextEntry(ZipEntry("OEBPS/epub.css"))
        epubcss = self.__getResourceAsStream("/epub/epub.css")
        IOUtils.copy(epubcss, zipOutputStream)
        zipOutputStream.closeEntry()

        #### Creating toc.ncx ####
        tocXml = ElementTree.Element(
            "ncx", {
                "version": "2005-1",
                "xml:lang": "en",
                "xmlns": "http://www.daisy.org/z3986/2005/ncx/"
            })
        headNode = ElementTree.Element("head")
        tocXml.append(headNode)

        headNode.append(
            ElementTree.Element("meta", {
                "name": "dtb:uid",
                "content": "1"
            }))
        headNode.append(
            ElementTree.Element("meta", {
                "name": "dtb:depth",
                "content": "1"
            }))
        headNode.append(
            ElementTree.Element("meta", {
                "name": "dtb:totalPageCount",
                "content": "1"
            }))
        headNode.append(
            ElementTree.Element("meta", {
                "name": "dtb:maxPageNumber",
                "content": "1"
            }))
        headNode.append(
            ElementTree.Element("meta", {
                "name": "dtb:generator",
                "content": "ICE v2"
            }))

        #docTitle
        docTitle = ElementTree.Element("docTitle")
        textNode = ElementTree.Element("text")
        textNode.text = title
        docTitle.append(textNode)
        tocXml.append(docTitle)

        #docAuthor
        docAuthor = ElementTree.Element("docAuthor")
        textNode = ElementTree.Element("text")
        textNode.text = "ICE v2"
        docAuthor.append(textNode)
        tocXml.append(docAuthor)

        #navMap
        navMap = ElementTree.Element("navMap")
        tocXml.append(navMap)

        #### Creating content.opf ####
        contentXml = ElementTree.Element(
            "package", {
                "version": "2.0",
                "xmlns": "http://www.idpf.org/2007/opf",
                "unique-identifier": "BookId"
            })

        metadataNode = ElementTree.Element(
            "metadata", {
                "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                "xmlns:opf": "http://www.idpf.org/2007/opf"
            })
        contentXml.append(metadataNode)

        #metadata information
        metadata = ElementTree.Element("dc:title")
        metadata.text = title
        metadataNode.append(metadata)

        metadata = ElementTree.Element("dc:language")
        metadata.text = "en-AU"
        metadataNode.append(metadata)

        metadata = ElementTree.Element("dc:creator", {"opf:role": "aut"})
        metadata.text = "ICE"
        metadataNode.append(metadata)

        metadata = ElementTree.Element("dc:publisher")
        metadata.text = "University of Southern Queensland"
        metadataNode.append(metadata)

        metadata = ElementTree.Element("dc:identifier", {"id": "BookId"})
        metadata.text = title
        metadataNode.append(metadata)

        #manifest
        manifest = ElementTree.Element("manifest")
        contentXml.append(manifest)

        spine = ElementTree.Element("spine", {"toc": "ncx"})
        contentXml.append(spine)

        item = ElementTree.Element("item", {
            "id": "ncx",
            "href": "toc.ncx",
            "media-type": "text/xml"
        })
        manifest.append(item)
        css = ElementTree.Element("item", {
            "id": "style",
            "href": "epub.css",
            "media-type": "text/css"
        })
        manifest.append(css)

        count = 1
        for itemHash in self.__orderedItem:
            id, title, htmlFileName, payloadDict, isImage = self.__itemRefDict[
                itemHash]

            for payloadId in payloadDict:
                payload, payloadType = payloadDict[payloadId]
                if isinstance(payload, Payload):
                    payloadId = payloadId.lower()
                    zipEntryId = payloadId.replace(" ", "_").replace("\\", "/")
                    if payloadType == "application/xhtml+xml":
                        zipOutputStream.putNextEntry(
                            ZipEntry("OEBPS/%s" % zipEntryId))
                        ##process the html....
                        saxReader = SAXReader(False)
                        try:
                            saxDoc = saxReader.read(payload.open())
                            payload.close()
                            #                            ## remove class or style nodes
                            #                            classOrStyleNodes = saxDoc.selectNodes("//@class | //@style ")
                            #                            for classOrStyleNode in classOrStyleNodes:
                            #                                node = classOrStyleNode
                            #                                if classOrStyleNode.getParent():
                            #                                    node = classOrStyleNode.getParent()
                            #                                if node.getQualifiedName() == "img":
                            #                                    attr = node.attribute(QName("class"))
                            #                                    attr = node.attribute(QName("class"))
                            #                                    if attr:
                            #                                        node.remove(attr)
                            #                                    attr = node.attribute(QName("style"))
                            #                                    if attr:
                            #                                        node.remove(attr)

                            ## remove name in a tags
                            ahrefs = saxDoc.selectNodes(
                                "//*[local-name()='a' and @name!='']")
                            for a in ahrefs:
                                attr = a.attribute(QName("name"))
                                if attr:
                                    a.remove(attr)

                            ## fix images src name.... replace space with underscore and all lower case
                            imgs = saxDoc.selectNodes(
                                "//*[local-name()='img' and contains(@src, '_files')]"
                            )
                            for img in imgs:
                                srcAttr = img.attribute(QName("src"))
                                if srcAttr:
                                    src = srcAttr.getValue()
                                    #hash the sourcename
                                    filepath, filename = os.path.split(src)
                                    filename, ext = os.path.splitext(filename)
                                    filename = hashlib.md5(
                                        filename).hexdigest()
                                    src = os.path.join(
                                        filepath.lower().replace(" ", "_"),
                                        "node-%s%s" % (filename, ext))
                                    img.addAttribute(QName("src"),
                                                     src.replace(" ", "_"))

                            bodyNode = saxDoc.selectSingleNode(
                                "//*[local-name()='div' and @class='body']")
                            bodyNode.setName("div")

                            out = ByteArrayOutputStream()
                            format = OutputFormat.createPrettyPrint()
                            format.setSuppressDeclaration(True)
                            writer = XMLWriter(out, format)
                            writer.write(bodyNode)
                            writer.flush()
                            contentStr = out.toString("UTF-8")

                            htmlString = """<?xml version="1.0" encoding="UTF-8"?>
                                            <html xmlns="http://www.w3.org/1999/xhtml"><head><title>%s</title>
                                            <link rel="stylesheet" href="epub.css"/>
                                            </head><body>%s</body></html>"""
                            htmlString = htmlString % (title, contentStr)
                            self.__copyString(htmlString, zipOutputStream)
                            includeFile = False
                        except:
                            traceback.print_exc()
                    else:
                        #images....
                        zipOutputStream.putNextEntry(
                            ZipEntry("OEBPS/%s" % zipEntryId))
                        IOUtils.copy(payload.open(), zipOutputStream)
                        payload.close()
                        zipOutputStream.closeEntry()
                else:
                    zipOutputStream.putNextEntry(
                        ZipEntry("OEBPS/%s" % zipEntryId))
                    IOUtils.copy(payload, zipOutputStream)
                    zipOutputStream.closeEntry()

                itemNode = ElementTree.Element("item", {
                    "media-type": payloadType,
                    "href": zipEntryId
                })
                if payloadId == htmlFileName.lower():
                    itemNode.set("id", itemHash)
                else:
                    itemNode.set("id", payloadId.replace("/", "_"))
                manifest.append(itemNode)

            if not isImage:
                navPoint = ElementTree.Element(
                    "navPoint", {
                        "class": "chapter",
                        "id": "%s" % itemHash,
                        "playOrder": "%s" % count
                    })
            else:
                navPoint = ElementTree.Element(
                    "navPoint", {
                        "class": "chapter",
                        "id": "%s" % htmlFileName,
                        "playOrder": "%s" % count
                    })
            navMap.append(navPoint)
            navLabel = ElementTree.Element("navLabel")
            navPoint.append(navLabel)
            textNode = ElementTree.Element("text")
            textNode.text = title
            navLabel.append(textNode)
            content = ElementTree.Element("content")
            navPoint.append(content)
            content.set("src", htmlFileName)
            count += 1

            itemRefNode = ElementTree.Element("itemref")
            spine.append(itemRefNode)
            itemRefNode.set("idref", itemHash)

        #saving content.opf...
        zipOutputStream.putNextEntry(ZipEntry("OEBPS/content.opf"))
        self.__copyString(ElementTree.tostring(contentXml), zipOutputStream)
        zipOutputStream.closeEntry()

        #saving toc.ncx
        zipOutputStream.putNextEntry(ZipEntry("OEBPS/toc.ncx"))
        self.__copyString(ElementTree.tostring(tocXml), zipOutputStream)
        zipOutputStream.closeEntry()

        zipOutputStream.close()
Beispiel #24
0
class XMLAlertHandler(AlertHandler):
    ''''Processing class for a single XML File.
    Each XML file is expected to contain only a single Collection
    '''
    def __init__(self, file, config, baseline):
        AlertHandler.__init__(self, file, config, baseline)
        docFactory = DocumentFactory()
        self.saxReader = SAXReader(docFactory)
        self.xmlMapFile = StrSubstitutor.replaceSystemProperties(
            config['xmlMap'])
        if not os.path.exists(self.xmlMapFile):
            raise AlertException("Requested xmlMap file %s does not exist." %
                                 self.xmlMapFile)

        ## Make sure we can see our mappings
        inStream = FileInputStream(File(self.xmlMapFile))
        xmlMappings = JsonSimple(inStream)
        self.map = xmlMappings.getObject(["mappings"])
        self.exceptions = xmlMappings.getObject(["exceptions"])
        self.defaultNamespace = xmlMappings.getObject(["defaultNamespace"])

        self.mappedExceptionCount = 0

    def process(self):
        '''Read the XML file and map xpath items to metadata
        Return a list with 1 JsonSimple object (at most)
        '''
        jsonList = []
        data = None
        reader = None
        inStream = None
        document = None

        # Run the XML through our parser
        try:
            inStream = FileInputStream(File(self.file))
            reader = InputStreamReader(inStream, "UTF-8")
            document = self.saxReader.read(reader)
        # Parse fails
        except:
            raise
        # Close our file access objects
        finally:
            if reader is not None:
                reader.close()
            if inStream is not None:
                inStream.close()

        # Now go looking for all our data
        data = self.getNewJsonObject()
        self.__mapXpathToFields(document, self.map, data)

        if data is None:
            return None

        jsonList.append(JsonSimple(data))
        return jsonList

    ## Used recursively
    def __mapXpathToFields(self, sourceData, map, responseData, index=1):
        for xpath in map.keySet():
            field = map.get(xpath)
            if xpath != "":
                xpathobj = DefaultXPath(xpath)
                if not self.defaultNamespace is None:
                    xpathobj.setNamespaceContext(
                        SimpleNamespaceContext(self.defaultNamespace))

                nodes = xpathobj.selectNodes(sourceData)

                if isinstance(field, JsonObject):
                    #The XPath key provides a dictionary containing sub xpath queries mapped to fields
                    i = 1
                    for node in nodes:
                        self.__mapXpathToFields(node, field, responseData, i)
                        i += 1
                else:
                    # Lists indicate we're copying the several fields
                    if isinstance(field, JSONArray):
                        for eachField in field:
                            self.__insertFieldData(nodes, eachField,
                                                   responseData, index)
                    # or just one field
                    else:
                        self.__insertFieldData(nodes, field, responseData,
                                               index)

    def __insertFieldData(self, xmlNodes, field, responseData, index):
        multiValue = False
        multiIndex = 1
        fieldString = ""

        if self.exceptions["fields"].containsKey(field):
            #The field is an exception
            excepted = True
            output = self.exceptions["output"]
            self.mappedExceptionCount += 1
        else:
            # Nope, just normal
            excepted = False
            if ('.0.' in field and len(xmlNodes) > 1):
                #In ReDBox, a field such as dc:subject.vivo:keyword.0.rdf:PlainLiteral indicates a list of values, using the number as a counter.
                #In the code below, if a field contains this number element, we can increment the counter and add more and more.
                #If there is no number, we just overwrite the value.
                multiValue = True
                #we'll do the fieldString index change a little later
                fieldString = field
            else:
                fieldString = field.replace(".0.", ".%s." % index, 1)

        for node in xmlNodes:
            try:
                text = node.getTextTrim()
            except:
                try:
                    text = node.getValue().strip()
                except:
                    text = node

            if fieldString != "" and text != "":
                if excepted:
                    exceptionString = "%s: '%s' (%s)" % (
                        exceptions["fields"][field], text, field)
                    responseData.put(fieldString, exceptionString)
                else:
                    if multiValue:
                        fieldString = field.replace(".0.", ".%s." % multiIndex,
                                                    1)
                        multiIndex += 1
                    responseData.put(fieldString, text)