def html(self):
     """Return the body extracted from the scrubbed ``.htm`` output.

     Reads the file at ``self.fullname + '.htm'``, passes its content
     through ``SafeHTML().scrub_html`` and returns whatever
     ``bodyfinder`` extracts from the result.
     """
     # The context manager closes the file even when read() raises.
     with open(self.fullname + '.htm', 'r') as htmlfile:
         html = htmlfile.read()
     html = SafeHTML().scrub_html(html)
     return bodyfinder(html)
 def html(self):
     """Read ``<fullname>.htm``, scrub it and return its body.

     The content of ``self.fullname + '.htm'`` is cleaned with
     ``SafeHTML().scrub_html`` and then handed to ``bodyfinder``,
     whose result is returned.
     """
     # ``with`` guarantees the handle is released on any read error.
     with open(self.fullname + '.htm', 'r') as htmlfile:
         html = htmlfile.read()
     html = SafeHTML().scrub_html(html)
     return bodyfinder(html)
Example #3
0
 def html(self):
     """Return the body of the generated HTML file in the work directory.

     Reads ``<self.tmpdir>/<self.__name__>.html``, cleans it with
     ``scrubHTMLNoRaise`` and returns the result of ``bodyfinder``.
     """
     # Use a context manager so the file is closed even if read() fails.
     with open("%s/%s.html" % (self.tmpdir, self.__name__), 'r') as htmlfile:
         html = htmlfile.read()
     html = scrubHTMLNoRaise(html)
     return bodyfinder(html)
Example #4
0
 def html(self):
     """Return the body extracted from ``<fullname>.htm``.

     The file content is cleaned with ``scrubHTMLNoRaise`` before being
     passed to ``bodyfinder``.
     """
     # Closing is handled by ``with`` so a failing read() cannot leak
     # the file handle.
     with open(self.fullname + '.htm', 'r') as htmlfile:
         html = htmlfile.read()
     html = scrubHTMLNoRaise(html)
     return bodyfinder(html)
Example #5
0
 def html(self):
     """Return the body extracted from the file at ``self.outputfile``.

     The file content is cleaned with ``scrubHTML`` and then passed to
     ``bodyfinder``; the value ``bodyfinder`` returns is the result.
     """
     # The file is closed by the context manager, including on errors.
     with open(self.outputfile, 'r') as htmlfile:
         html = htmlfile.read()
     html = scrubHTML(html)
     return bodyfinder(html)
Example #6
0
 def html(self):
     """Load ``<fullname>.htm`` and return its scrubbed body.

     Scrubbing is done by ``scrubHTMLNoRaise``; the body is whatever
     ``bodyfinder`` returns for the scrubbed markup.
     """
     # ``with`` releases the handle even if read() raises.
     with open(self.fullname + '.htm', 'r') as htmlfile:
         html = htmlfile.read()
     html = scrubHTMLNoRaise(html)
     return bodyfinder(html)
 def html(self):
     """Return the body of ``<self.tmpdir>/<self.__name__>.html``.

     The file content is cleaned with ``scrubHTML`` and the result of
     ``bodyfinder`` on the cleaned markup is returned.
     """
     # Context manager: the handle is closed on success and on error.
     with open("%s/%s.html" % (self.tmpdir, self.__name__), 'r') as htmlfile:
         html = htmlfile.read()
     html = scrubHTML(html)
     return bodyfinder(html)
Example #8
0
 def html(self):
     """Read ``self.outputfile`` and return its scrubbed body.

     The content is cleaned with ``scrubHTMLNoRaise`` and then passed to
     ``bodyfinder``, whose return value is the result.
     """
     # Closing via ``with`` avoids leaking the handle if read() raises.
     with open(self.outputfile, "r") as htmlfile:
         html = htmlfile.read()
     html = scrubHTMLNoRaise(html)
     return bodyfinder(html)
 def html(self):
     """Read ``<fullname>.htm`` and return its scrubbed body.

     The content is cleaned with ``scrubHTML`` and then passed to
     ``bodyfinder``, whose return value is the result.
     """
     # The context manager closes the file on every exit path.
     with open(self.fullname + ".htm", "r") as htmlfile:
         html = htmlfile.read()
     html = scrubHTML(html)
     return bodyfinder(html)
 def _html(self):
     """Return ``xmltag`` followed by the body of the generated HTML file.

     Reads ``<self.tmpdir>/<self.__name__>.html``, runs ``bodyfinder`` on
     the raw content and prepends ``xmltag`` to the result.
     """
     # ``with`` closes the file even when read() raises.
     with open(pjoin(self.tmpdir, self.__name__+".html"), 'r') as htmlfile:
         html = htmlfile.read()
     # Scrubbing is left disabled here:
     #html = scrubHTML(html)
     body = bodyfinder(html)
     return xmltag + body
 def _html(self):
     """Read the generated HTML file and return ``xmltag`` + its body.

     The file is ``<self.tmpdir>/<self.__name__>.html``; its raw content
     goes through ``bodyfinder`` and ``xmltag`` is prepended to the
     result.
     """
     # The context manager releases the handle on any read error.
     with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
         html = htmlfile.read()
     # Scrubbing is left disabled here:
     #html = scrubHTML(html)
     body = bodyfinder(html)
     return xmltag + body
 def _html(self):
     """Return the scrubbed body of the generated HTML file.

     Reads ``<self.tmpdir>/<self.__name__>.html``; when the
     ``process_double_encoding`` flag is true the content is first passed
     through ``noDoubleEncoding``.  The content is then cleaned with
     ``scrubHTML`` and the result of ``bodyfinder`` is returned.
     """
     # Close the file as soon as it has been read, so that a failure in
     # read() or in the later processing cannot leak the handle.
     with open(pjoin(self.tmpdir, self.__name__+".html"), 'r') as htmlfile:
         html = htmlfile.read()
     if process_double_encoding:
         html = noDoubleEncoding(html)
     html = scrubHTML(html)
     return bodyfinder(html)
Example #13
0
 def _html(self):
     """Read the generated HTML file and return its scrubbed body.

     The file is ``<self.tmpdir>/<self.__name__>.html``.  If
     ``process_double_encoding`` is true, ``noDoubleEncoding`` is applied
     before ``scrubHTML``; ``bodyfinder`` produces the returned value.
     """
     # The handle is released right after reading; previously it stayed
     # open during noDoubleEncoding and leaked if that call raised.
     with open(pjoin(self.tmpdir, self.__name__+".html"), 'r') as htmlfile:
         html = htmlfile.read()
     if process_double_encoding:
         html = noDoubleEncoding(html)
     html = scrubHTML(html)
     return bodyfinder(html)
Example #14
0
class document(commandtransform):
    """Command transform that runs a ``tika`` binary over a document.

    ``__init__`` copies the input data into a temporary work directory,
    ``convert`` redirects the binary's stdout into
    ``<tmpdir>/<__name__>.html`` and ``html`` reads that file back,
    sanitizes it and returns the extracted body.
    """

    # Suffix appended to the working file name when it is missing.
    # NOTE(review): empty here; presumably overridden by subclasses.
    file_ext = ''

    def __init__(self, name, data, exec_prefix=None):
        """Initialization: create tmp work directory and copy the
        document into a file.

        name -- transform name, forwarded to ``commandtransform.__init__``.
        data -- document content, handed to ``initialize_tmpdir``.
        exec_prefix -- optional directory searched for ``tika-bin`` and
            then ``tika``; when neither exists a warning is logged and
            the bare command name ``'tika'`` is used.
        """
        tika_path = 'tika'
        if exec_prefix is not None:
            tika_path = os.path.join(exec_prefix, 'tika-bin')
            if not os.path.exists(tika_path):
                tika_path = os.path.join(exec_prefix, 'tika')
                if not os.path.exists(tika_path):
                    log.warn('no tika-bin or tika found in exec-prefix: %s' %
                             tika_path)
                    tika_path = 'tika'

        commandtransform.__init__(self, name, binary=tika_path)
        name = self.name()
        if not name.endswith(self.file_ext):
            name = name + self.file_ext
        self.tmpdir, self.fullname = self.initialize_tmpdir(data,
                                                            filename=name)

    def convert(self):
        """Convert the document.

        Runs ``self.binary`` on ``self.fullname`` with its stdout
        redirected into ``<tmpdir>/<__name__>.html``.  The output file is
        always created; the binary is only invoked when ``os.name`` is
        ``'posix'``.  A non-zero exit status is logged, not raised.
        """
        tmpdir = self.tmpdir

        # for windows, install wvware from GnuWin32 at C:\Program Files\GnuWin32\bin
        # you can use:
        # wvware.exe -c ..\share\wv\wvHtml.xml --charset=utf-8 -d d:\temp d:\temp\test.doc > test.html

        # The context manager closes the output file even when the
        # subprocess call raises something other than CalledProcessError
        # (e.g. OSError for a missing binary).
        with open("%s/%s.html" % (self.tmpdir, self.__name__), 'w') as htmlfile:
            if os.name == 'posix':
                try:
                    subprocess.check_call([self.binary, self.fullname],
                                          stdout=htmlfile,
                                          cwd=tmpdir)
                except subprocess.CalledProcessError as cpe:
                    log.warn('Could not transform %s: %s' % (self.fullname, cpe))

    def html(self):
        """Return the sanitized body of the converted file, UTF-8 encoded.

        Reads ``<tmpdir>/<__name__>.html``, converts it with
        ``safe_unicode`` and sanitizes it with ``laundryutils.sanitize``.
        If sanitizing fails the failure is logged and an empty document
        is used instead, so this method stays best-effort.
        """
        with open("%s/%s.html" % (self.tmpdir, self.__name__), 'r') as htmlfile:
            html = htmlfile.read()
        html = safe_unicode(html)
        try:
            html = laundryutils.sanitize(html, HTMLCleaner)
        except Exception as err:
            # Deliberate best-effort: fall back to empty output, but leave
            # a trace instead of swallowing the error silently.
            log.warn('Could not sanitize %s: %r' % (self.fullname, err))
            html = ''
        # scrubHTML is EVIL, takes ages!
        #html = scrubHTML(html)
        body = bodyfinder(html)
        body = body.encode('utf-8')
        return body
 def convert(self, data, cache, **kwargs):
     """Run ``xlhtml`` on ``data`` and store the extracted body in ``cache``.

     data -- content written to a temporary file that is passed to the
         ``xlhtml`` command.
     cache -- object receiving the result through ``setData``; it is
         also the return value.
     kwargs -- accepted for interface compatibility; not used here.

     The command's exit status and stderr are not inspected.
     """
     # The context manager removes the temporary file deterministically,
     # also when writing or launching the command fails.
     with NamedTemporaryFile() as tmp:
         tmp.write(data)
         tmp.flush()  # make sure xlhtml sees the full content on disk
         process = Popen(['xlhtml', tmp.name], stdout=PIPE, stderr=PIPE)
         stdout, _stderr = process.communicate()
     cache.setData(bodyfinder(stdout))
     return cache
Example #16
0
    def convert(self, data, cache, **kwargs):
        """Convert ``data`` and store body and sub-objects in ``cache``.

        data -- document content, written to a temporary directory by
            ``initialize_tmpdir`` under the file name ``unknown.pdf``.
        cache -- object receiving the body via ``setData`` and the
            collected objects via ``setSubObjects``; also returned.
        kwargs -- forwarded to ``initialize_tmpdir``; any ``filename``
            entry is overwritten.
        """
        kwargs['filename'] = 'unknown.pdf'

        tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
        try:
            html = self.invokeCommand(tmpdir, fullname)
            path, images = self.subObjects(tmpdir)
            objects = {}
            if images:
                self.fixImages(path, images, objects)
        finally:
            # Always remove the work directory, even when the command or
            # the image handling raises.
            self.cleanDir(tmpdir)
        cache.setData(bodyfinder(html))
        cache.setSubObjects(objects)
        return cache
 def _html(self):
     """Return the body of the generated HTML file, or ``""`` if unreadable.

     Reads ``<self.tmpdir>/<self.__name__>.html``; an ``IOError`` while
     opening or reading yields an empty string.  When
     ``process_double_encoding`` is true the content goes through
     ``noDoubleEncoding`` before ``bodyfinder``.
     """
     try:
         # ``with`` closes the handle even if read() raises IOError after
         # a successful open (the early return used to leak it).
         with open(pjoin(self.tmpdir, self.__name__+".html"), 'r') as htmlfile:
             html = htmlfile.read()
     except IOError:
         return ""
     if process_double_encoding:
         html = noDoubleEncoding(html)
     #xlhtml gives very complex html ; scrubHTML takes far too long !
     #html = scrubHTML(html)
     return bodyfinder(html)
Example #18
0
    def convert(self, data, cache, **kwargs):
        """Convert ``data`` and store body and sub-objects in ``cache``.

        data -- document content, written to a temporary directory by
            ``initialize_tmpdir`` under the file name ``unknow.rtf``.
        cache -- object receiving the body via ``setData`` and the
            collected objects via ``setSubObjects``; also returned.
        kwargs -- forwarded to ``initialize_tmpdir``; any ``filename``
            entry is overwritten.
        """
        kwargs['filename'] = 'unknow.rtf'

        tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
        try:
            html = self.invokeCommand(tmpdir, fullname)
            path, images = self.subObjects(tmpdir)
            objects = {}
            if images:
                self.fixImages(path, images, objects)
        finally:
            # Clean the work directory on every exit path so a failing
            # command does not leave temporary files behind.
            self.cleanDir(tmpdir)
        cache.setData(bodyfinder(html))
        cache.setSubObjects(objects)
        return cache
Example #19
0
def get_text_from_view(view_name):
    """
    Return the text taken from a browser view template's <body> tag.

    The view named ``view_name`` is looked up on the portal; its rendered
    index is passed through ``bodyfinder``, stripped and decoded from
    UTF-8 when it is not already text.  An empty string is returned when
    the portal has no ``REQUEST`` or the view lookup yields ``None``.
    """
    portal = api.portal.get()
    request = getattr(portal, "REQUEST", None)
    if request is None:
        return ""

    view = api.content.get_view(name=view_name, context=portal, request=request)
    if view is None:
        return ""

    text = bodyfinder(view.index()).strip()
    return text if isinstance(text, text_type) else text.decode("utf-8")
Example #20
0
 def _html(self):
     """Return the body of the generated HTML file, or ``""`` on failure.

     Reads ``<self.tmpdir>/<self.__name__>.html``.  An empty string is
     returned when the file cannot be opened or read (``IOError``) or
     when ``noDoubleEncoding`` runs out of memory (``MemoryError``).
     """
     try:
         # The context manager closes the handle even when read() raises
         # IOError after a successful open.
         with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
             html = htmlfile.read()
     except IOError:
         return ""
     if process_double_encoding:
         # This operation can be very memory-consuming ...
         try:
             html = noDoubleEncoding(html)
         except MemoryError:
             return ""
     #xlhtml gives very complex html ; scrubHTML takes far too long !
     #html = scrubHTML(html)
     return bodyfinder(html)
 def _html(self):
     """Read the generated HTML file and return its body (``""`` on failure).

     The file is ``<self.tmpdir>/<self.__name__>.html``.  ``IOError`` on
     open/read and ``MemoryError`` raised by ``noDoubleEncoding`` both
     result in an empty string.
     """
     try:
         # ``with`` guarantees the file is closed on the IOError path too.
         with open(pjoin(self.tmpdir, self.__name__+".html"), 'r') as htmlfile:
             html = htmlfile.read()
     except IOError:
         return ""
     if process_double_encoding:
         # This operation can be very memory-consuming ...
         try:
             html = noDoubleEncoding(html)
         except MemoryError:
             return ""
     #xlhtml gives very complex html ; scrubHTML takes far too long !
     #html = scrubHTML(html)
     return bodyfinder(html)
Example #22
0
def get_default_text(context):
    """
    Return the text taken from the ``default_gdpr_text`` view's <body> tag.

    The view is looked up on the portal; its rendered index is passed
    through ``bodyfinder``, stripped and decoded from UTF-8 when it is a
    byte string.  An empty unicode string is returned when the portal has
    no ``REQUEST`` or the view lookup yields ``None``.

    ``context`` is accepted for interface compatibility and is not used.
    """
    portal = api.portal.get()
    view_name = 'default_gdpr_text'
    request = getattr(portal, 'REQUEST', None)
    if request is not None:
        view = api.content.get_view(
            name=view_name,
            context=portal,
            request=request
        )
        if view is not None:
            text = bodyfinder(view.index()).strip()
            # ``bytes`` is ``str`` on Python 2 and exists on Python 3,
            # unlike the ``unicode`` builtin, which raises NameError there.
            if isinstance(text, bytes):
                text = text.decode("utf-8")
            return text
    return u''
Example #23
0
 def getData(self, couterr):
     """Read everything from ``couterr`` and return ``bodyfinder``'s result."""
     output = couterr.read()
     return bodyfinder(output)
Example #24
0
 def getData(self, couterr):
     """Return what ``bodyfinder`` extracts from the content of ``couterr``."""
     raw = couterr.read()
     return bodyfinder(raw)
Example #25
0
 def convert(self, orig, data, **kwargs):
     """Store ``bodyfinder(orig)`` in ``data`` via ``setData`` and return ``data``.

     ``kwargs`` is accepted but not used.
     """
     data.setData(bodyfinder(orig))
     return data
Example #26
0
 def convert(self, orig, data, **kwargs):
     """Pass ``orig`` through ``bodyfinder`` and hand the result to ``data``.

     The result is stored with ``data.setData`` and ``data`` itself is
     returned; ``kwargs`` is accepted but not used.
     """
     extracted = bodyfinder(orig)
     data.setData(extracted)
     return data