def html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.htm`` output.

    Reads the file named ``self.fullname + '.htm'``, passes its content
    through ``SafeHTML().scrub_html`` and returns what ``bodyfinder``
    yields for the scrubbed markup.
    """
    # The context manager closes the file even when read() raises.
    with open(self.fullname + '.htm', 'r') as htmlfile:
        html = htmlfile.read()
    html = SafeHTML().scrub_html(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.htm`` output.

    The file ``self.fullname + '.htm'`` is read in full, cleaned with
    ``SafeHTML().scrub_html`` and then handed to ``bodyfinder``.
    """
    # ``with`` guarantees the handle is released if reading fails.
    with open(self.fullname + '.htm', 'r') as htmlfile:
        html = htmlfile.read()
    html = SafeHTML().scrub_html(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the generated ``.html`` file.

    The file is looked up as ``<self.tmpdir>/<self.__name__>.html``; its
    content goes through ``scrubHTMLNoRaise`` before ``bodyfinder``.
    """
    # The context manager closes the file even when read() raises.
    with open("%s/%s.html" % (self.tmpdir, self.__name__), 'r') as htmlfile:
        html = htmlfile.read()
    html = scrubHTMLNoRaise(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.htm`` output.

    Reads ``self.fullname + '.htm'``, cleans the markup with
    ``scrubHTMLNoRaise`` and returns what ``bodyfinder`` yields for it.
    """
    # The context manager closes the file even when read() raises.
    with open(self.fullname + '.htm', 'r') as htmlfile:
        html = htmlfile.read()
    html = scrubHTMLNoRaise(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the scrubbed output file.

    Reads the file at ``self.outputfile``, cleans the markup with
    ``scrubHTML`` and returns what ``bodyfinder`` yields for it.
    """
    # The context manager closes the file even when read() raises.
    with open(self.outputfile, 'r') as htmlfile:
        html = htmlfile.read()
    html = scrubHTML(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.htm`` output.

    The content of ``self.fullname + '.htm'`` is cleaned with
    ``scrubHTMLNoRaise`` and then passed to ``bodyfinder``.
    """
    # ``with`` guarantees the handle is released if reading fails.
    with open(self.fullname + '.htm', 'r') as htmlfile:
        html = htmlfile.read()
    html = scrubHTMLNoRaise(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the generated ``.html`` file.

    The file is looked up as ``<self.tmpdir>/<self.__name__>.html``; its
    content goes through ``scrubHTML`` before ``bodyfinder``.
    """
    # The context manager closes the file even when read() raises.
    with open("%s/%s.html" % (self.tmpdir, self.__name__), 'r') as htmlfile:
        html = htmlfile.read()
    html = scrubHTML(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the scrubbed output file.

    Reads the file at ``self.outputfile``, cleans the markup with
    ``scrubHTMLNoRaise`` and returns what ``bodyfinder`` yields for it.
    """
    # The context manager closes the file even when read() raises.
    with open(self.outputfile, "r") as htmlfile:
        html = htmlfile.read()
    html = scrubHTMLNoRaise(html)
    return bodyfinder(html)
def html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.htm`` output.

    Reads ``self.fullname + ".htm"``, cleans the markup with ``scrubHTML``
    and returns what ``bodyfinder`` yields for it.
    """
    # The context manager closes the file even when read() raises.
    with open(self.fullname + ".htm", "r") as htmlfile:
        html = htmlfile.read()
    html = scrubHTML(html)
    return bodyfinder(html)
def _html(self):
    """Return ``xmltag`` followed by the ``bodyfinder`` result of the output.

    The file ``<self.tmpdir>/<self.__name__>.html`` is read in full and
    passed to ``bodyfinder`` without scrubbing; ``xmltag`` is prepended to
    the result.
    """
    # The context manager closes the file even when read() raises.
    with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
        html = htmlfile.read()
    # NOTE: the scrubHTML(html) pass was already disabled in the original
    # code; it is intentionally not re-enabled here.
    body = bodyfinder(html)
    return xmltag + body
def _html(self):
    """Return ``xmltag`` followed by the ``bodyfinder`` result of the output.

    Reads ``<self.tmpdir>/<self.__name__>.html``, feeds the unscrubbed
    markup to ``bodyfinder`` and prefixes the result with ``xmltag``.
    """
    # ``with`` guarantees the handle is released if reading fails.
    with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
        html = htmlfile.read()
    # NOTE: the scrubHTML(html) pass was already disabled in the original
    # code; it is intentionally not re-enabled here.
    body = bodyfinder(html)
    return xmltag + body
def _html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.html`` output.

    Reads ``<self.tmpdir>/<self.__name__>.html``. When the
    ``process_double_encoding`` flag is truthy the content is first passed
    through ``noDoubleEncoding``; it is then cleaned with ``scrubHTML`` and
    handed to ``bodyfinder``.
    """
    # Close the file right after reading so the handle is not held (or
    # leaked on error) while the content is being post-processed.
    with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
        html = htmlfile.read()
    if process_double_encoding:
        html = noDoubleEncoding(html)
    html = scrubHTML(html)
    return bodyfinder(html)
def _html(self):
    """Return the ``bodyfinder`` result for the scrubbed ``.html`` output.

    The file ``<self.tmpdir>/<self.__name__>.html`` is read in full,
    optionally run through ``noDoubleEncoding`` (when
    ``process_double_encoding`` is truthy), cleaned with ``scrubHTML`` and
    finally passed to ``bodyfinder``.
    """
    # The handle is released before any post-processing starts, so a
    # failure in the later steps cannot leave the file open.
    with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
        html = htmlfile.read()
    if process_double_encoding:
        html = noDoubleEncoding(html)
    html = scrubHTML(html)
    return bodyfinder(html)
class document(commandtransform):
    """Command transform that runs the ``tika`` binary on a document.

    On construction the document data is stored in a temporary work
    directory. ``convert`` runs the binary and captures its standard
    output in an ``.html`` file; ``html`` reads that file back, sanitizes
    it and returns the ``bodyfinder`` result encoded as UTF-8.
    """

    # Suffix appended to the document name when the name lacks it.
    file_ext = ''

    def __init__(self, name, data, exec_prefix=None):
        """ Initialization: create tmp work directory and copy the document into a file"""
        tika_path = 'tika'
        if exec_prefix is not None:
            # Prefer ``tika-bin`` under the prefix, then ``tika``; fall back
            # to the bare command name when neither exists.
            tika_path = os.path.join(exec_prefix, 'tika-bin')
            if not os.path.exists(tika_path):
                tika_path = os.path.join(exec_prefix, 'tika')
            if not os.path.exists(tika_path):
                log.warn('no tika-bin or tika found in exec-prefix: %s' % tika_path)
                tika_path = 'tika'
        commandtransform.__init__(self, name, binary=tika_path)
        name = self.name()
        if not name.endswith(self.file_ext):
            name = name + self.file_ext
        self.tmpdir, self.fullname = self.initialize_tmpdir(data, filename=name)

    def convert(self):
        "Convert the document"
        tmpdir = self.tmpdir
        # for windows, install wvware from GnuWin32 at C:\Program Files\GnuWin32\bin
        # you can use:
        # wvware.exe -c ..\share\wv\wvHtml.xml --charset=utf-8 -d d:\temp d:\temp\test.doc > test.html
        # The context manager closes the output file on every exit path.
        with open("%s/%s.html" % (self.tmpdir, self.__name__), 'w') as htmlfile:
            # The binary is only invoked on posix; elsewhere the output
            # file is created and left empty.
            if os.name == 'posix':
                try:
                    subprocess.check_call([self.binary, self.fullname],
                                          stdout=htmlfile, cwd=tmpdir)
                except subprocess.CalledProcessError as cpe:
                    log.warn('Could not transform %s: %s' % (self.fullname, cpe))

    def html(self):
        """Return the sanitized ``bodyfinder`` result, encoded as UTF-8.

        If sanitizing fails for any reason the failure is logged and an
        empty string is passed to ``bodyfinder`` instead.
        """
        with open("%s/%s.html" % (self.tmpdir, self.__name__), 'r') as htmlfile:
            html = htmlfile.read()
        html = safe_unicode(html)
        try:
            html = laundryutils.sanitize(html, HTMLCleaner)
        except Exception as err:
            # Best effort: keep going with empty markup, but leave a trace.
            # %r avoids encoding problems when formatting the exception.
            log.warn('Could not sanitize %s: %r' % (self.fullname, err))
            html = ''
        # scrubHTML is EVIL, takes ages! (deliberately not used here)
        body = bodyfinder(html)
        body = body.encode('utf-8')
        return body
def convert(self, data, cache, **kwargs):
    """Run ``xlhtml`` on *data* and store the ``bodyfinder`` result in *cache*.

    *data* is written to a named temporary file, ``xlhtml`` is invoked on
    that file, and its standard output is passed through ``bodyfinder``
    before being stored with ``cache.setData``. Returns *cache*.
    """
    # Using the temporary file as a context manager closes it (and thereby
    # removes it) deterministically instead of waiting for garbage
    # collection; the command must run while the file still exists.
    with NamedTemporaryFile() as tmp:
        tmp.write(data)
        tmp.flush()
        process = Popen(['xlhtml', tmp.name], stdout=PIPE, stderr=PIPE)
        # stderr is captured so it does not leak to the console; unused.
        stdout, _stderr = process.communicate()
    cache.setData(bodyfinder(stdout))
    return cache
def convert(self, data, cache, **kwargs):
    """Transform *data* and store the result and its sub-objects in *cache*.

    The filename is forced to ``'unknown.pdf'``, the data is placed in a
    temporary directory via ``initialize_tmpdir``, and ``invokeCommand``
    produces the markup. Whatever ``subObjects`` reports is passed through
    ``fixImages`` into a dict that ends up in ``cache.setSubObjects``.
    Returns *cache*.
    """
    kwargs['filename'] = 'unknown.pdf'
    workdir, source_path = self.initialize_tmpdir(data, **kwargs)
    markup = self.invokeCommand(workdir, source_path)

    sub_objects = {}
    image_dir, image_names = self.subObjects(workdir)
    if image_names:
        self.fixImages(image_dir, image_names, sub_objects)

    # The work directory is cleaned before the cache is populated.
    self.cleanDir(workdir)

    cache.setData(bodyfinder(markup))
    cache.setSubObjects(sub_objects)
    return cache
def _html(self):
    """Return the ``bodyfinder`` result for the generated ``.html`` file.

    Returns an empty string when ``<self.tmpdir>/<self.__name__>.html``
    cannot be opened or read (``IOError``). When
    ``process_double_encoding`` is truthy the content is run through
    ``noDoubleEncoding`` before being handed to ``bodyfinder``.
    """
    try:
        # ``with`` closes the handle even if read() raises after a
        # successful open, which the manual close() did not cover.
        with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
            html = htmlfile.read()
    except IOError:
        return ""
    if process_double_encoding:
        html = noDoubleEncoding(html)
    # xlhtml gives very complex html; scrubHTML takes far too long on it,
    # so the scrubHTML(html) pass stays disabled.
    return bodyfinder(html)
def convert(self, data, cache, **kwargs):
    """Transform *data* and store the result and its sub-objects in *cache*.

    The filename is forced to ``'unknow.rtf'``, the data is placed in a
    temporary directory via ``initialize_tmpdir``, and ``invokeCommand``
    produces the markup. Whatever ``subObjects`` reports is passed through
    ``fixImages`` into a dict that ends up in ``cache.setSubObjects``.
    Returns *cache*.
    """
    kwargs['filename'] = 'unknow.rtf'
    workdir, source_path = self.initialize_tmpdir(data, **kwargs)
    markup = self.invokeCommand(workdir, source_path)

    sub_objects = {}
    image_dir, image_names = self.subObjects(workdir)
    if image_names:
        self.fixImages(image_dir, image_names, sub_objects)

    # The work directory is cleaned before the cache is populated.
    self.cleanDir(workdir)

    cache.setData(bodyfinder(markup))
    cache.setSubObjects(sub_objects)
    return cache
def get_text_from_view(view_name):
    """Return the stripped ``bodyfinder`` text of a browser view template.

    The view named *view_name* is looked up on the portal using the
    portal's ``REQUEST``. An empty string is returned when the portal has
    no request or the view lookup yields ``None``. Non-text results are
    decoded as UTF-8.
    """
    site = api.portal.get()
    req = getattr(site, "REQUEST", None)
    if req is None:
        return ""

    browser_view = api.content.get_view(name=view_name, context=site, request=req)
    if browser_view is None:
        return ""

    rendered = bodyfinder(browser_view.index()).strip()
    if isinstance(rendered, text_type):
        return rendered
    return rendered.decode("utf-8")
def _html(self):
    """Return the ``bodyfinder`` result for the generated ``.html`` file.

    Returns an empty string when ``<self.tmpdir>/<self.__name__>.html``
    cannot be opened or read (``IOError``), or when ``noDoubleEncoding``
    runs out of memory. ``noDoubleEncoding`` is only applied when
    ``process_double_encoding`` is truthy.
    """
    try:
        # ``with`` closes the handle even if read() raises after a
        # successful open, which the manual close() did not cover.
        with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
            html = htmlfile.read()
    except IOError:
        return ""
    if process_double_encoding:
        # This operation can be very memory-consuming ...
        try:
            html = noDoubleEncoding(html)
        except MemoryError:
            return ""
    # xlhtml gives very complex html; scrubHTML takes far too long on it,
    # so the scrubHTML(html) pass stays disabled.
    return bodyfinder(html)
def _html(self):
    """Return the ``bodyfinder`` result for the generated ``.html`` file.

    An empty string is returned when the file
    ``<self.tmpdir>/<self.__name__>.html`` cannot be opened or read
    (``IOError``), or when ``noDoubleEncoding`` raises ``MemoryError``.
    The double-encoding pass only runs when ``process_double_encoding``
    is truthy.
    """
    try:
        # The context manager releases the handle on every exit path,
        # including a failing read().
        with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
            html = htmlfile.read()
    except IOError:
        return ""
    if process_double_encoding:
        # This operation can be very memory-consuming ...
        try:
            html = noDoubleEncoding(html)
        except MemoryError:
            return ""
    # xlhtml gives very complex html; scrubHTML takes far too long on it,
    # so the scrubHTML(html) pass stays disabled.
    return bodyfinder(html)
def get_default_text(context):
    """Return the stripped ``bodyfinder`` text of the ``default_gdpr_text`` view.

    The view is looked up on the portal using the portal's ``REQUEST``;
    *context* is not used by the lookup. An empty unicode string is
    returned when the portal has no request or the view lookup yields
    ``None``. Non-unicode results are decoded as UTF-8.
    """
    site = api.portal.get()
    view_name = 'default_gdpr_text'
    req = getattr(site, 'REQUEST', None)
    if req is None:
        return u''

    browser_view = api.content.get_view(
        name=view_name,
        context=site,
        request=req
    )
    if browser_view is None:
        return u''

    rendered = bodyfinder(browser_view.index()).strip()
    if isinstance(rendered, unicode):
        return rendered
    return rendered.decode("utf-8")
def getData(self, couterr):
    """Return ``bodyfinder`` applied to everything read from *couterr*."""
    raw_output = couterr.read()
    return bodyfinder(raw_output)
def getData(self, couterr):
    """Read *couterr* to the end and return its ``bodyfinder`` result."""
    captured = couterr.read()
    extracted = bodyfinder(captured)
    return extracted
def convert(self, orig, data, **kwargs):
    """Store the ``bodyfinder`` result of *orig* in *data* and return *data*.

    Extra keyword arguments are accepted and ignored.
    """
    data.setData(bodyfinder(orig))
    return data
def convert(self, orig, data, **kwargs):
    """Pass *orig* through ``bodyfinder`` and record the result on *data*.

    Returns the same *data* object; ``**kwargs`` is not consulted.
    """
    extracted = bodyfinder(orig)
    data.setData(extracted)
    return data