Beispiel #1
0
    def html_text(self, fn):
        """Convert the zip archive ``fn`` to an HTML document.

        The head of the result holds the output of ``stylesheet_meta``
        applied to the archive's ``meta.xml`` member; the body holds the
        output of ``stylesheet_content`` applied to ``content.xml``.

        Metadata extraction is best effort: an error there is ignored and
        the head is simply left without metadata. Failing to open the file
        or the archive, or to read or transform ``content.xml``, raises to
        the caller.

        Returns the HTML document as bytes.
        """
        # Attributes of the meta tag must be separated by whitespace, hence
        # the trailing space in the first literal.
        docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type" ' \
                  b'content="text/html; charset=UTF-8">'

        # The archive is handed the open file, so the file has to stay open
        # for as long as members are read; the with block closes it on every
        # exit path, including exceptions.
        with open(fn, 'rb') as f:
            archive = ZipFile(f)

            # Wrap metadata extraction because it can sometimes throw
            # while the main text will be valid. (To be checked: this may be
            # what happens when nothing matches in the metadata.)
            try:
                metadata = archive.read("meta.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(stylesheet_meta, metadata)
                    docdata += res
            except Exception:
                pass

            docdata += b'</head>\n<body>\n'

            content = archive.read("content.xml")
            if content:
                res = rclxslt.apply_sheet_data(stylesheet_content, content)
                docdata += res

        docdata += b'</body></html>'

        return docdata
Beispiel #2
0
    def extractone(self, params):
        """Convert the zip archive named by ``params["filename:"]`` to HTML.

        The head of the generated document holds the output of
        ``meta_stylesheet`` applied to ``docProps/core.xml``. The body holds
        the transformed ``word/document.xml``, ``xl/sharedStrings.xml`` and
        ``ppt/slides/slide*.xml`` members, whichever are present, with the
        slides taken in numeric order.

        Returns a 4-tuple. On success it is
        ``(True, html_bytes, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing or the archive cannot be opened it is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``. Problems with
        individual members are logged through ``self.em.rclog`` and do not
        fail the extraction.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            f = open(fn, 'rb')
        except Exception as err:
            self.em.rclog("unzip failed: " + str(err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        def slide_order(name):
            # Numbered slides come first, in numeric order (slide2 before
            # slide11); any other name let through by the pattern goes
            # last, in name order.
            number = name[len('ppt/slides/slide'):-len('.xml')]
            try:
                return (0, int(number), name)
            except ValueError:
                return (1, 0, name)

        # The archive is handed the open file, so the file has to stay open
        # until the whole document is built; the with block closes it on
        # every exit path.
        with f:
            try:
                archive = ZipFile(f)
            except Exception as err:
                self.em.rclog("unzip failed: " + str(err))
                return (False, "", "", rclexecm.RclExecM.eofnow)

            docdata = b'<html><head>'

            try:
                metadata = archive.read("docProps/core.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
                    docdata += res
            except Exception as err:
                # To be checked. I'm under the impression that I get this
                # when nothing matches?
                self.em.rclog("no/bad metadata in %s: %s" % (fn, err))

            docdata += b'</head><body>'

            for member, kind in (('word/document.xml', 'word'),
                                 ('xl/sharedStrings.xml', 'xl')):
                try:
                    content = archive.read(member)
                    stl = self.computestylesheet(kind)
                    docdata += rclxslt.apply_sheet_data(stl, content)
                except KeyError:
                    # Expected: the archive simply holds another kind of
                    # document (assumes ZipFile is zipfile.ZipFile, whose
                    # read() raises KeyError for an absent member).
                    pass
                except Exception as err:
                    self.em.rclog("bad data in %s: %s" % (fn, err))

            try:
                stl = self.computestylesheet('pp')
                slides = [name for name in archive.namelist()
                          if fnmatch.fnmatch(name, 'ppt/slides/slide*.xml')]
                # The loop variable is deliberately not called fn, so the
                # file path stays available for the log message below.
                for member in sorted(slides, key=slide_order):
                    content = archive.read(member)
                    docdata += rclxslt.apply_sheet_data(stl, content)
            except Exception as err:
                self.em.rclog("bad data in %s: %s" % (fn, err))

            docdata += b'</body></html>'

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #3
0
    def extractone(self, params):
        """Convert the zip archive named by ``params["filename:"]`` to HTML.

        The head of the generated document holds the output of
        ``meta_stylesheet`` applied to ``docProps/core.xml``. The body holds
        the transformed ``word/document.xml``, ``xl/sharedStrings.xml`` and
        ``ppt/slides/slide*.xml`` members, whichever are present, with the
        slides taken in numeric order.

        Returns a 4-tuple. On success it is
        ``(True, html_bytes, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing or the archive cannot be opened it is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``. Problems with
        individual members are logged through ``self.em.rclog`` and do not
        fail the extraction.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            f = open(fn, 'rb')
        except Exception as err:
            self.em.rclog("unzip failed: " + str(err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        def slide_order(name):
            # Numbered slides come first, in numeric order (slide2 before
            # slide11); any other name let through by the pattern goes
            # last, in name order.
            number = name[len('ppt/slides/slide'):-len('.xml')]
            try:
                return (0, int(number), name)
            except ValueError:
                return (1, 0, name)

        # The archive is handed the open file, so the file has to stay open
        # until the whole document is built; the with block closes it on
        # every exit path.
        with f:
            try:
                archive = ZipFile(f)
            except Exception as err:
                self.em.rclog("unzip failed: " + str(err))
                return (False, "", "", rclexecm.RclExecM.eofnow)

            docdata = b'<html><head>'

            try:
                metadata = archive.read("docProps/core.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
                    docdata += res
            except Exception as err:
                # To be checked. I'm under the impression that I get this
                # when nothing matches?
                self.em.rclog("no/bad metadata in %s: %s" % (fn, err))

            docdata += b'</head><body>'

            for member, kind in (('word/document.xml', 'word'),
                                 ('xl/sharedStrings.xml', 'xl')):
                try:
                    content = archive.read(member)
                    stl = self.computestylesheet(kind)
                    docdata += rclxslt.apply_sheet_data(stl, content)
                except KeyError:
                    # Expected: the archive simply holds another kind of
                    # document (assumes ZipFile is zipfile.ZipFile, whose
                    # read() raises KeyError for an absent member).
                    pass
                except Exception as err:
                    self.em.rclog("bad data in %s: %s" % (fn, err))

            try:
                stl = self.computestylesheet('pp')
                slides = [name for name in archive.namelist()
                          if fnmatch.fnmatch(name, 'ppt/slides/slide*.xml')]
                # The loop variable is deliberately not called fn, so the
                # file path stays available for the log message below.
                for member in sorted(slides, key=slide_order):
                    content = archive.read(member)
                    docdata += rclxslt.apply_sheet_data(stl, content)
            except Exception as err:
                self.em.rclog("bad data in %s: %s" % (fn, err))

            docdata += b'</body></html>'

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #4
0
    def html_text(self, fn):
        """Convert the zip archive ``fn`` to an HTML document.

        The head of the result holds the output of ``meta_stylesheet``
        applied to ``docProps/core.xml``. The body holds the transformed
        ``word/document.xml``, ``xl/sharedStrings.xml`` and
        ``ppt/slides/slide*.xml`` members, whichever are present, with the
        slides taken in numeric order.

        Every part is best effort: an error while reading or transforming
        one member only drops that part. Failing to open the file or the
        archive raises to the caller.

        Returns the HTML document as bytes.
        """
        def slide_order(name):
            # Numbered slides come first, in numeric order (slide2 before
            # slide11); any other name let through by the pattern goes
            # last, in name order.
            number = name[len('ppt/slides/slide'):-len('.xml')]
            try:
                return (0, int(number), name)
            except ValueError:
                return (1, 0, name)

        docdata = b'<html><head>'

        # The archive is handed the open file, so the file has to stay open
        # for as long as members are read; the with block closes it on every
        # exit path.
        with open(fn, 'rb') as f:
            archive = ZipFile(f)

            try:
                metadata = archive.read("docProps/core.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
                    docdata += res
            except Exception:
                pass

            docdata += b'</head><body>'

            # A given archive normally holds only one kind of document, so
            # failures here (typically an absent member) are expected.
            for member, kind in (('word/document.xml', 'word'),
                                 ('xl/sharedStrings.xml', 'xl')):
                try:
                    content = archive.read(member)
                    stl = self.computestylesheet(kind)
                    docdata += rclxslt.apply_sheet_data(stl, content)
                except Exception:
                    pass

            try:
                stl = self.computestylesheet('pp')
                slides = [name for name in archive.namelist()
                          if fnmatch.fnmatch(name, 'ppt/slides/slide*.xml')]
                for member in sorted(slides, key=slide_order):
                    content = archive.read(member)
                    docdata += rclxslt.apply_sheet_data(stl, content)
            except Exception:
                pass

        docdata += b'</body></html>'

        return docdata
Beispiel #5
0
    def extractone(self, params):
        """Convert the zip archive named by ``params["filename:"]`` to HTML.

        The file name is decoded from UTF-8 before the archive is opened.
        The head of the generated document holds the output of
        ``stylesheet_meta`` applied to ``meta.xml`` (best effort: errors
        there are ignored); the body holds the output of
        ``stylesheet_content`` applied to ``content.xml``.

        Returns a 4-tuple. On success it is
        ``(True, html_bytes, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing, the archive cannot be opened, or ``content.xml``
        cannot be read or transformed, the problem is logged through
        ``self.em.rclog`` and the result is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            archive = ZipFile(fn.decode('UTF-8'))
        except Exception as err:
            self.em.rclog("unzip failed: %s" % err)
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # The finally clause closes the archive on every exit path,
        # including the early error return below.
        try:
            # Attributes of the meta tag must be separated by whitespace,
            # hence the trailing space in the first literal.
            docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type" ' \
                      b'content="text/html; charset=UTF-8">'

            try:
                metadata = archive.read("meta.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(stylesheet_meta, metadata)
                    docdata += res
            except Exception:
                # To be checked. I'm under the impression that I get this
                # when nothing matches?
                pass

            docdata += b'</head>\n<body>\n'

            try:
                content = archive.read("content.xml")
                if content:
                    res = rclxslt.apply_sheet_data(stylesheet_content,
                                                   content)
                    docdata += res
                docdata += b'</body></html>'
            except Exception as err:
                self.em.rclog("bad data in %s: %s" % (fn, err))
                return (False, "", "", rclexecm.RclExecM.eofnow)
        finally:
            archive.close()

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #6
0
    def extractone(self, params):
        """Transform the file named by ``params["filename:"]`` to a document.

        The whole file is read as bytes and passed through
        ``stylesheet_all`` with ``rclxslt.apply_sheet_data``.

        Returns a 4-tuple. On success it is
        ``(True, docdata, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing, or the file cannot be read or transformed, the
        problem is logged through ``self.em.rclog`` and the result is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            # The with block closes the file even when read() raises.
            with open(fn, 'rb') as f:
                data = f.read()
            docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
        except Exception as err:
            # Two values need two placeholders; with a single one the
            # formatting itself raises TypeError from inside this handler.
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #7
0
    def extractone(self, params):
        """Transform the file named by ``params["filename:"]`` to a document.

        The whole file is read as bytes and passed through
        ``stylesheet_all`` with ``rclxslt.apply_sheet_data``.

        Returns a 4-tuple. On success it is
        ``(True, docdata, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing, or the file cannot be read or transformed, the
        problem is logged through ``self.em.rclog`` and the result is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            # The with block closes the file even when read() raises.
            with open(fn, 'rb') as f:
                data = f.read()
            docdata = rclxslt.apply_sheet_data(stylesheet_all, data)
        except Exception as err:
            # Two values need two placeholders; with a single one the
            # formatting itself raises TypeError from inside this handler.
            self.em.rclog("%s: bad data: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #8
0
    def extractone(self, params):
        """Convert the file named by ``params["filename:"]`` to HTML.

        The whole file is read as bytes and transformed twice: with
        ``stylesheet_meta`` for the document head (best effort: errors there
        are ignored) and with ``stylesheet_content`` for the body.

        Returns a 4-tuple. On success it is
        ``(True, html_bytes, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing, the file cannot be read, or the content
        transformation fails, the problem is logged through
        ``self.em.rclog`` and the result is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            # The with block closes the file even when read() raises.
            with open(fn, 'rb') as f:
                data = f.read()
        except Exception as err:
            self.em.rclog("open failed: %s" % err)
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # Attributes of the meta tag must be separated by whitespace, hence
        # the trailing space in the first literal.
        docdata = b'<html>\n<head>\n<meta http-equiv="Content-Type" ' \
                  b'content="text/html; charset=UTF-8">\n'

        try:
            res = rclxslt.apply_sheet_data(stylesheet_meta, data)
            docdata += res
        except Exception:
            # To be checked. I'm under the impression that I get this when
            # nothing matches?
            pass

        docdata += b'</head><body>'

        try:
            res = rclxslt.apply_sheet_data(stylesheet_content, data)
            docdata += res
            docdata += b'</body></html>'
        except Exception as err:
            self.em.rclog("bad data in %s: %s" % (fn, err))
            return (False, "", "", rclexecm.RclExecM.eofnow)

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #9
0
    def extractone(self, params):
        """Convert the zip archive named by ``params["filename:"]`` to HTML.

        The file name is decoded from UTF-8 before the archive is opened.
        The head of the generated document holds the output of
        ``stylesheet_meta`` applied to ``meta.xml`` (best effort: errors
        there are ignored); the body holds the output of
        ``stylesheet_content`` applied to ``content.xml``.

        Returns a 4-tuple. On success it is
        ``(True, html_bytes, "", rclexecm.RclExecM.eofnext)``. When the file
        name is missing, the archive cannot be opened, or ``content.xml``
        cannot be read or transformed, the problem is logged through
        ``self.em.rclog`` and the result is
        ``(False, "", "", rclexecm.RclExecM.eofnow)``.
        """
        if "filename:" not in params:
            self.em.rclog("extractone: no mime or file name")
            return (False, "", "", rclexecm.RclExecM.eofnow)
        fn = params["filename:"]

        try:
            archive = ZipFile(fn.decode('UTF-8'))
        except Exception as err:
            self.em.rclog("unzip failed: %s" % err)
            return (False, "", "", rclexecm.RclExecM.eofnow)

        # The finally clause closes the archive on every exit path,
        # including the early error return below.
        try:
            # Attributes of the meta tag must be separated by whitespace,
            # hence the trailing space in the first literal.
            docdata = b'<html><head><meta http-equiv="Content-Type" ' \
                      b'content="text/html; charset=UTF-8">'

            try:
                metadata = archive.read("meta.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(stylesheet_meta, metadata)
                    docdata += res
            except Exception:
                # To be checked. I'm under the impression that I get this
                # when nothing matches?
                pass

            # The head is closed only after the metadata has been added, so
            # the metadata ends up inside <head> and not inside <body>.
            # NOTE(review): assumes stylesheet_meta emits head-level
            # elements - confirm against the stylesheet.
            docdata += b'</head><body>'

            try:
                content = archive.read("content.xml")
                if content:
                    res = rclxslt.apply_sheet_data(stylesheet_content,
                                                   content)
                    docdata += res
                docdata += b'</body></html>'
            except Exception as err:
                self.em.rclog("bad data in %s: %s" % (fn, err))
                return (False, "", "", rclexecm.RclExecM.eofnow)
        finally:
            archive.close()

        return (True, docdata, "", rclexecm.RclExecM.eofnext)
Beispiel #10
0
    def html_text(self, fn):
        """Convert the zip archive ``fn`` to an HTML document.

        The head of the result holds the output of ``meta_stylesheet``
        applied to ``docProps/core.xml``. The body holds, in this order and
        for whichever members are present: the transformed
        ``word/document.xml``, ``xl/sharedStrings.xml``,
        ``ppt/slides/slide<N>.xml`` and ``visio/pages/page<N>.xml``, the
        numbered members being taken in numeric order of ``<N>``.

        Every part is best effort: an error while reading or transforming
        one member only drops that part. Failing to open the file or the
        archive raises to the caller.

        Returns the HTML document as bytes.
        """
        docdata = b'<html><head>'

        # The archive is handed the open file, so the file has to stay open
        # for as long as members are read; the with block closes it on every
        # exit path.
        with open(fn, 'rb') as f:
            archive = ZipFile(f)

            try:
                metadata = archive.read("docProps/core.xml")
                if metadata:
                    res = rclxslt.apply_sheet_data(meta_stylesheet, metadata)
                    docdata += res
            except Exception:
                pass

            docdata += b'</head><body>'

            # A given archive normally holds only one kind of document, so
            # failures here (typically an absent member) are expected.
            for member, kind in (('word/document.xml', 'word'),
                                 ('xl/sharedStrings.xml', 'xl')):
                try:
                    content = archive.read(member)
                    stl = self.computestylesheet(kind)
                    docdata += rclxslt.apply_sheet_data(stl, content)
                except Exception:
                    pass

            for prefix, kind in (('ppt/slides/slide', 'pp'),
                                 ('visio/pages/page', 'vs')):
                try:
                    # The dot is escaped and the pattern anchored at the end
                    # so that only "<prefix><digits>.xml" is accepted; the
                    # captured digits give the numeric sort key.
                    numbered = re.compile(
                        re.escape(prefix) + r'([0-9]+)\.xml$')
                    pages = []
                    for name in archive.namelist():
                        found = numbered.match(name)
                        if found:
                            pages.append((int(found.group(1)), name))
                    if not pages:
                        # Nothing of this kind: do not even build the
                        # stylesheet.
                        continue
                    stl = self.computestylesheet(kind)
                    for _, name in sorted(pages):
                        content = archive.read(name)
                        docdata += rclxslt.apply_sheet_data(stl, content)
                except Exception:
                    pass

        docdata += b'</body></html>'

        return docdata
Beispiel #11
0
 def html_text(self, fn):
     """Read the file ``fn`` and transform it with ``self.stylesheet``.

     When ``self.dogz`` is true the file is read through ``gzip.open``,
     otherwise through the plain ``open``. Errors while opening, reading or
     transforming the file raise to the caller.

     Returns whatever ``rclxslt.apply_sheet_data`` returns for the data.
     """
     opener = gzip.open if self.dogz else open
     # The with block closes the handle even when read() raises.
     with opener(fn, 'rb') as f:
         data = f.read()
     return rclxslt.apply_sheet_data(self.stylesheet, data)