def _html(self):
    """Return the body of the HTML file produced for this document.

    Reads ``<self.tmpdir>/<self.__name__>.html``, undoes double encoding
    when the ``process_double_encoding`` flag is true, passes the markup
    through ``scrubHTML`` and returns whatever ``bodyfinder`` extracts.

    Errors raised while opening or reading the file are not handled here
    and propagate to the caller.
    """
    htmlfile = open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r')
    try:
        html = htmlfile.read()
    finally:
        # Always release the handle, even when the read fails, and do it
        # before post-processing so a later error cannot leak it.
        htmlfile.close()

    if process_double_encoding:
        html = noDoubleEncoding(html)
    html = scrubHTML(html)
    return bodyfinder(html)
def _html(self):
    """Return the body of the HTML file produced for this document.

    Reads ``<self.tmpdir>/<self.__name__>.html``, undoes double encoding
    when the ``process_double_encoding`` flag is true, and returns whatever
    ``bodyfinder`` extracts from the markup.

    Returns an empty string when the file cannot be opened or read
    (``IOError``).
    """
    try:
        htmlfile = open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r')
        try:
            html = htmlfile.read()
        finally:
            # Close on every path: a failing read used to return "" with
            # the handle still open.
            htmlfile.close()
    except IOError:
        return ""

    if process_double_encoding:
        html = noDoubleEncoding(html)

    # xlhtml gives very complex HTML and scrubHTML takes far too long on it,
    # so the markup is deliberately not scrubbed here.
    return bodyfinder(html)
def _html(self):
    """Return the body of the HTML file produced for this document.

    Reads ``<self.tmpdir>/<self.__name__>.html``, undoes double encoding
    when the ``process_double_encoding`` flag is true, and returns whatever
    ``bodyfinder`` extracts from the markup.

    Returns an empty string when the file cannot be opened or read
    (``IOError``) or when undoing the double encoding runs out of memory
    (``MemoryError``).
    """
    try:
        htmlfile = open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r')
        try:
            html = htmlfile.read()
        finally:
            # Close on every path: a failing read used to return "" with
            # the handle still open.
            htmlfile.close()
    except IOError:
        return ""

    if process_double_encoding:
        # This operation can be very memory-consuming.
        try:
            html = noDoubleEncoding(html)
        except MemoryError:
            return ""

    # xlhtml gives very complex HTML and scrubHTML takes far too long on it,
    # so the markup is deliberately not scrubbed here.
    return bodyfinder(html)
def _html(self):
    """Load the generated HTML file and return its body.

    The file read is ``<self.tmpdir>/<self.__name__>.html``. When the
    ``process_double_encoding`` flag is true the content is passed through
    ``noDoubleEncoding`` first; the result of ``bodyfinder`` on the markup
    is returned.

    An empty string is returned instead when opening or reading the file
    raises ``IOError``, or when ``noDoubleEncoding`` raises ``MemoryError``.
    """
    path = pjoin(self.tmpdir, self.__name__ + ".html")
    try:
        htmlfile = open(path, 'r')
        try:
            html = htmlfile.read()
        finally:
            # Release the handle whether or not the read succeeded; the
            # IOError path previously returned without closing it.
            htmlfile.close()
    except IOError:
        return ""

    if process_double_encoding:
        # This operation can be very memory-consuming.
        try:
            html = noDoubleEncoding(html)
        except MemoryError:
            return ""

    # xlhtml gives very complex HTML and scrubHTML takes far too long on it,
    # so the markup is deliberately not scrubbed here.
    return bodyfinder(html)
def convert(self, data, cache, **kwargs):
    """Convert ``data`` to HTML and store the result in ``cache``.

    ``kwargs['filename']`` defaults to ``'unknown.pdf'`` when it is missing
    or empty. The data is handed to ``initialize_tmpdir``, the command is
    run through ``invokeCommand``, and the resulting markup goes through
    ``fixBrokenStyles`` and, when ``process_double_encoding`` is true,
    ``noDoubleEncoding``. Images reported by ``subObjects`` are collected
    into a dict via ``fixImages``.

    The body found by ``bodyfinder`` is decoded as UTF-8 with invalid
    sequences replaced, re-encoded as UTF-8 and stored with
    ``cache.setData``; the collected objects are stored with
    ``cache.setSubObjects``.

    Returns the same ``cache`` object that was passed in. Exceptions from
    the conversion steps propagate to the caller.
    """
    if not kwargs.get('filename'):
        kwargs['filename'] = 'unknown.pdf'

    tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
    objects = {}
    try:
        html = self.invokeCommand(tmpdir, fullname)
        html = fixBrokenStyles(html)
        if process_double_encoding:
            html = noDoubleEncoding(html)
        path, images = self.subObjects(tmpdir)
        if images:
            self.fixImages(path, images, objects)
    finally:
        # Remove the temporary directory even when a conversion step
        # fails; otherwise every failed conversion leaves it behind.
        self.cleanDir(tmpdir)

    body = bodyfinder(html)
    cache.setData(body.decode('utf-8', 'replace').encode('utf-8'))
    cache.setSubObjects(objects)
    return cache
def convert(self, data, cache, **kwargs):
    """Run the conversion of ``data`` and fill ``cache`` with the output.

    Steps, in order:

    1. Default ``kwargs['filename']`` to ``'unknown.pdf'`` when it is
       missing or empty.
    2. Obtain ``(tmpdir, fullname)`` from ``initialize_tmpdir``.
    3. Get the markup from ``invokeCommand`` and pass it through
       ``fixBrokenStyles`` and, if ``process_double_encoding`` is true,
       ``noDoubleEncoding``.
    4. Collect the images reported by ``subObjects`` into a dict using
       ``fixImages``.
    5. Remove the temporary directory with ``cleanDir``.
    6. Store the ``bodyfinder`` result (decoded as UTF-8 with invalid
       sequences replaced, then re-encoded as UTF-8) via ``cache.setData``
       and the collected objects via ``cache.setSubObjects``.

    Returns the ``cache`` argument. Exceptions raised by any step propagate
    to the caller.
    """
    if not kwargs.get('filename'):
        kwargs['filename'] = 'unknown.pdf'

    tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
    objects = {}
    try:
        html = self.invokeCommand(tmpdir, fullname)
        html = fixBrokenStyles(html)
        if process_double_encoding:
            html = noDoubleEncoding(html)
        path, images = self.subObjects(tmpdir)
        if images:
            self.fixImages(path, images, objects)
    finally:
        # Clean up on failure as well as success so a raising step does
        # not leak the temporary directory.
        self.cleanDir(tmpdir)

    cache.setData(
        bodyfinder(html).decode('utf-8', 'replace').encode('utf-8'))
    cache.setSubObjects(objects)
    return cache