def transform(self, data, options=None):
    """Run the prepared transform with an ``.html`` input suffix and keep the body.

    Args:
        data: Input passed to ``self._validate`` and then to
            ``self.prepare_transform``.
        options: Accepted for interface compatibility; not used here.

    Returns:
        The object returned by ``self.prepare_transform``, with its ``data``
        attribute replaced by a ``StringIter`` over the output of
        ``html_bodyfinder``. Returns ``None`` when validation fails or when
        the prepared transform produced no data.
    """
    if self._validate(data) is None:
        return None

    arguments = {'infile_data_suffix': '.html'}

    result = self.prepare_transform(data, arguments=arguments)
    # A missing payload would otherwise blow up in ''.join(None) with a
    # TypeError; report "no result" the same way a failed validation does.
    if result.data is None:
        return None

    # NOTE(review): calling .decode on the joined chunks assumes they are
    # byte strings (Python 2 ``str``) -- confirm before porting.
    text = ''.join(result.data).decode('utf-8', 'ignore')
    result.data = StringIter(html_bodyfinder(text))
    return result
Example #2
0
    def transform(self, data, options=None):
        """Run the prepared transform for a chosen output device and keep the body.

        The device defaults to ``self.dev``; when ``options`` is given, its
        ``"output_image_format"`` entry takes precedence (falling back to
        ``self.dev`` if the key is absent).

        Returns ``None`` when validation fails or the prepared transform
        yields no data; otherwise returns the result object with ``data``
        replaced by a ``StringIter`` over the extracted body.
        """
        validated = self._validate(data)
        if validated is None:
            return None

        device = self.dev
        if options is not None:
            device = options.get("output_image_format", device)

        arguments = {
            "infile_data_suffix": ".html",
            "dev": "-dev %s" % device,
        }
        result = self.prepare_transform(data, arguments=arguments)

        chunks = result.data
        if chunks is None:
            return None

        joined = "".join(chunks)
        decoded = joined.decode("utf-8", "ignore")
        # pdftohtml emits broken styles; patch them before body extraction
        repaired = self.fixBrokenStyles(decoded)
        result.data = StringIter(html_bodyfinder(repaired))
        return result
    def transform(self, data, options=None):
        """Join the input chunks and wrap the extracted body in a result.

        Returns ``None`` when ``self._validate`` rejects ``data``; otherwise a
        ``TransformResult`` holding a ``StringIter`` over the output of
        ``html_bodyfinder`` applied to the joined input. ``options`` is
        accepted but not used.
        """
        verdict = self._validate(data)
        if verdict is None:
            return None

        joined = u''.join(data)
        body = html_bodyfinder(joined)
        return TransformResult(StringIter(body))
Example #4
0
 def extract_output(self, stdout):
     """Read ``stdout``, extract the body, and return it as a ``StringIter``.

     ``html_bodyfinder`` is applied to the raw read first; its result is
     then decoded as UTF-8 with undecodable bytes ignored.
     """
     raw_output = stdout.read()
     body = html_bodyfinder(raw_output)
     decoded_body = body.decode("utf-8", "ignore")
     return StringIter(decoded_body)