def _html(self):
    """Return the body of the HTML file generated for this object.

    Reads ``<self.__name__>.html`` from ``self.tmpdir``, passes the text
    through ``noDoubleEncoding`` when the ``process_double_encoding`` flag
    (defined outside this method) is truthy, then through ``scrubHTML``,
    and returns the result of ``bodyfinder``.

    Errors from opening or reading the file are not caught here and
    propagate to the caller.
    """
    path = pjoin(self.tmpdir, self.__name__ + ".html")
    # The context manager closes the handle even when read() raises, and
    # the file is no longer held open while the text is post-processed.
    with open(path, 'r') as htmlfile:
        html = htmlfile.read()
    if process_double_encoding:
        html = noDoubleEncoding(html)
    html = scrubHTML(html)
    return bodyfinder(html)
Exemple #2
0
 def _html(self):
     """Load the generated HTML file and return its body.

     The file ``<self.__name__>.html`` is read from ``self.tmpdir``. Its
     content goes through ``noDoubleEncoding`` (only when the externally
     defined ``process_double_encoding`` flag is truthy) and ``scrubHTML``
     before ``bodyfinder`` extracts the value that is returned.

     Failures to open or read the file are not handled here; the
     exception reaches the caller.
     """
     path = pjoin(self.tmpdir, self.__name__ + ".html")
     # ``with`` guarantees the handle is released on every exit path,
     # including an exception raised by read().
     with open(path, 'r') as htmlfile:
         html = htmlfile.read()
     if process_double_encoding:
         html = noDoubleEncoding(html)
     html = scrubHTML(html)
     return bodyfinder(html)
 def _html(self):
     """Return the body of the generated HTML file, or "" if unreadable.

     Reads ``<self.__name__>.html`` from ``self.tmpdir``. When opening or
     reading raises ``IOError`` an empty string is returned. Otherwise the
     text is passed through ``noDoubleEncoding`` if the externally defined
     ``process_double_encoding`` flag is truthy, and the result of
     ``bodyfinder`` is returned.
     """
     try:
         # ``with`` closes the handle even when read() fails; previously
         # the early return on IOError skipped the close() call.
         with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
             html = htmlfile.read()
     except IOError:
         return ""
     if process_double_encoding:
         html = noDoubleEncoding(html)
     # scrubHTML is deliberately not applied here: xlhtml produces very
     # complex HTML and scrubbing it takes far too long.
     return bodyfinder(html)
 def _html(self):
     """Return the body of the generated HTML file, or "" on failure.

     Reads ``<self.__name__>.html`` from ``self.tmpdir``. An empty string
     is returned when opening or reading raises ``IOError``, or when
     ``noDoubleEncoding`` raises ``MemoryError``. ``noDoubleEncoding`` is
     only applied if the externally defined ``process_double_encoding``
     flag is truthy. On success the result of ``bodyfinder`` is returned.
     """
     try:
         # ``with`` releases the handle even when read() raises; the old
         # code returned on IOError without ever closing the file.
         with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
             html = htmlfile.read()
     except IOError:
         return ""
     if process_double_encoding:
         # This operation can be very memory-consuming, so running out of
         # memory is treated like an unreadable file.
         try:
             html = noDoubleEncoding(html)
         except MemoryError:
             return ""
     # scrubHTML is deliberately not applied here: xlhtml produces very
     # complex HTML and scrubbing it takes far too long.
     return bodyfinder(html)
Exemple #5
0
 def _html(self):
     """Load the generated HTML file and return its body ("" on failure).

     The file ``<self.__name__>.html`` is read from ``self.tmpdir``. If
     opening or reading raises ``IOError``, or if ``noDoubleEncoding``
     raises ``MemoryError``, an empty string is returned instead.
     ``noDoubleEncoding`` runs only when the externally defined
     ``process_double_encoding`` flag is truthy. Otherwise the value
     produced by ``bodyfinder`` is returned.
     """
     try:
         # Using ``with`` ensures the handle is closed on every path,
         # including an IOError raised by read().
         with open(pjoin(self.tmpdir, self.__name__ + ".html"), 'r') as htmlfile:
             html = htmlfile.read()
     except IOError:
         return ""
     if process_double_encoding:
         # This operation can be very memory-consuming; give up quietly
         # rather than propagate MemoryError.
         try:
             html = noDoubleEncoding(html)
         except MemoryError:
             return ""
     # scrubHTML is intentionally skipped: xlhtml output is very complex
     # HTML and scrubbing it takes far too long.
     return bodyfinder(html)
    def convert(self, data, cache, **kwargs):
        """Convert ``data`` to HTML and store the result in ``cache``.

        Args:
            data: input handed to ``self.initialize_tmpdir``.
            cache: object receiving the result through ``setData`` and
                ``setSubObjects``.
            **kwargs: forwarded to ``self.initialize_tmpdir``; a missing or
                empty ``filename`` is replaced by ``'unknown.pdf'``.

        Returns:
            The same ``cache`` object that was passed in.
        """
        if not kwargs.get('filename'):
            kwargs['filename'] = 'unknown.pdf'

        tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
        # try/finally so the temporary directory is cleaned up even when
        # the command or one of the post-processing steps raises.
        try:
            html = self.invokeCommand(tmpdir, fullname)
            html = fixBrokenStyles(html)
            if process_double_encoding:
                html = noDoubleEncoding(html)

            path, images = self.subObjects(tmpdir)
            objects = {}
            if images:
                self.fixImages(path, images, objects)
        finally:
            self.cleanDir(tmpdir)
        # Decoding with 'replace' and re-encoding substitutes any byte
        # sequence that is not valid UTF-8 before the data is stored.
        cache.setData(bodyfinder(html).decode('utf-8', 'replace').encode('utf-8'))
        cache.setSubObjects(objects)
        return cache
Exemple #7
0
    def convert(self, data, cache, **kwargs):
        """Run the conversion of ``data`` and fill ``cache`` with the output.

        Args:
            data: input passed on to ``self.initialize_tmpdir``.
            cache: result holder; ``setData`` receives the HTML body and
                ``setSubObjects`` the dict filled by ``self.fixImages``.
            **kwargs: passed on to ``self.initialize_tmpdir``. When
                ``filename`` is absent or falsy it defaults to
                ``'unknown.pdf'``.

        Returns:
            ``cache``, after it has been populated.
        """
        if not kwargs.get('filename'):
            kwargs['filename'] = 'unknown.pdf'

        tmpdir, fullname = self.initialize_tmpdir(data, **kwargs)
        # Guarantee cleanDir() runs: without the finally clause an
        # exception in any step below would leave tmpdir behind.
        try:
            html = self.invokeCommand(tmpdir, fullname)
            html = fixBrokenStyles(html)
            if process_double_encoding:
                html = noDoubleEncoding(html)

            path, images = self.subObjects(tmpdir)
            objects = {}
            if images:
                self.fixImages(path, images, objects)
        finally:
            self.cleanDir(tmpdir)
        # Round-trip through unicode with 'replace' so invalid UTF-8 byte
        # sequences are substituted before the data is cached.
        cache.setData(
            bodyfinder(html).decode('utf-8', 'replace').encode('utf-8'))
        cache.setSubObjects(objects)
        return cache