def transform(self, data, options=None):
    """Convert *data* and keep only the HTML body of the output.

    The conversion is delegated to ``self.prepare_transform``. The chunks
    it produces are joined, decoded as UTF-8 (undecodable bytes are
    dropped) and passed through ``html_bodyfinder``.

    :param data: input handed to ``self._validate`` and
        ``self.prepare_transform``.
    :param options: not used by this implementation.
    :returns: the object returned by ``prepare_transform`` with its
        ``data`` attribute replaced by a ``StringIter`` over the body
        markup, or ``None`` when validation fails or the conversion
        produced no data.
    """
    if self._validate(data) is None:
        return None

    arguments = {'infile_data_suffix': '.html'}
    result = self.prepare_transform(data, arguments=arguments)

    # Nothing to post-process: return None instead of letting
    # ''.join(None) raise a TypeError.
    if result.data is None:
        return None

    # NOTE(review): .decode() assumes the chunks are byte strings
    # (Python 2 ``str``) -- confirm against prepare_transform.
    text = ''.join(result.data).decode('utf-8', 'ignore')
    result.data = StringIter(html_bodyfinder(text))
    return result
def transform(self, data, options=None):
    """Convert *data* and keep only the HTML body of the output.

    The output device defaults to ``self.dev`` and can be overridden per
    call through the ``"output_image_format"`` key of *options*.

    :param data: input handed to ``self._validate`` and
        ``self.prepare_transform``.
    :param options: optional mapping; only ``"output_image_format"`` is
        consulted.
    :returns: the object returned by ``prepare_transform`` with its
        ``data`` attribute replaced by a ``StringIter`` over the body
        markup, or ``None`` when validation fails or no data was produced.
    """
    if self._validate(data) is None:
        return None

    # An explicit option wins over the instance-level default device.
    if options is None:
        device = self.dev
    else:
        device = options.get("output_image_format", self.dev)

    result = self.prepare_transform(
        data,
        arguments={
            "infile_data_suffix": ".html",
            "dev": "-dev %s" % device,
        },
    )
    if result.data is None:
        return None

    joined = "".join(result.data)
    markup = joined.decode("utf-8", "ignore")
    # workaround because of bug in pdftohtml
    markup = self.fixBrokenStyles(markup)
    result.data = StringIter(html_bodyfinder(markup))
    return result
def transform(self, data, options=None):
    """Return a ``TransformResult`` holding the HTML body found in *data*.

    :param data: iterable of text chunks; they are concatenated before the
        body is extracted.
    :param options: not used by this implementation.
    :returns: ``TransformResult`` wrapping a ``StringIter`` over the body
        markup, or ``None`` when ``self._validate`` rejects *data*.
    """
    validation = self._validate(data)
    if validation is None:
        return None

    markup = u''.join(data)
    body = html_bodyfinder(markup)
    return TransformResult(StringIter(body))
def extract_output(self, stdout):
    """Read *stdout* and return its HTML body wrapped in a ``StringIter``.

    :param stdout: object with a ``read()`` method supplying the raw output.
    :returns: ``StringIter`` over the body markup, decoded as UTF-8 with
        undecodable bytes dropped.
    """
    raw_output = stdout.read()
    # Body extraction runs on the raw output; decoding happens afterwards.
    body = html_bodyfinder(raw_output)
    decoded_body = body.decode("utf-8", "ignore")
    return StringIter(decoded_body)