def pdf_convert2(self, doc, encoding, mimetype):
    """Extract plain text from a PDF document.

    The document is first written out through ``self.saveFile``. On
    Windows the saved file is rendered with ``pdftohtml`` and the HTML
    is flattened by ``html2text``; on every other platform ``pdftotext``
    is run directly with UTF-8 output requested.

    Returns a ``(text, encoding)`` tuple. The reported encoding is the
    caller-supplied ``encoding`` when it is truthy, otherwise ``'utf-8'``.
    ``mimetype`` is accepted for signature compatibility but not used.
    """
    saved_path = self.saveFile(doc)
    reported_encoding = encoding or 'utf-8'

    # Non-Windows: pdftotext already yields the raw text.
    if sys.platform != 'win32':
        text = self.execute('pdftotext -enc UTF-8 "%s" -' % saved_path)
        return text, reported_encoding

    # Windows: go through HTML and strip the markup afterwards.
    markup = self.execute('pdftohtml -stdout -i -noframes "%s"' % saved_path)
    text = html2text(markup, ignore_tags=('img',), indent_width=4, page_width=80)
    return text, reported_encoding
def pdf_convert2(self, doc, encoding, mimetype):
    """Turn PDF data into raw text.

    Saves ``doc`` via ``self.saveFile`` and converts the resulting file:
    with ``pdftohtml`` followed by ``html2text`` when ``sys.platform`` is
    ``'win32'``, and with ``pdftotext`` (UTF-8 output) otherwise.

    Returns a two-item tuple of the converted text and an encoding name:
    ``encoding`` itself if it is truthy, else ``'utf-8'``. The
    ``mimetype`` argument is not consulted.
    """
    pdf_path = self.saveFile(doc)
    on_windows = sys.platform == 'win32'

    if on_windows:
        html_dump = self.execute('pdftohtml -stdout -i -noframes "%s"' % pdf_path)
        plain = html2text(html_dump, ignore_tags=('img',), indent_width=4, page_width=80)
    else:
        plain = self.execute('pdftotext -enc UTF-8 "%s" -' % pdf_path)

    # Fall back to 'utf-8' when the caller gave no (or an empty) encoding.
    return plain, (encoding if encoding else 'utf-8')
def xls_convert(self, doc):
    """Extract raw text from an Excel document.

    The document is written out through ``self.saveFile``. On Windows the
    saved file is converted with ``xlhtml`` and the HTML is flattened by
    ``html2text``; elsewhere the output of ``xls2csv`` is returned as-is,
    with its stderr redirected to ``/dev/null``.
    """
    saved_path = self.saveFile(doc)

    # Non-Windows: xls2csv output is returned unchanged.
    if sys.platform != 'win32':
        return self.execute('xls2csv -d UTF-8 -q 0 "%s" 2> /dev/null' % saved_path)

    # Windows: convert to HTML first, then strip the markup.
    markup = self.execute('xlhtml "%s"' % saved_path)
    return html2text(markup, ignore_tags=('img',), indent_width=4, page_width=80)
def xls_convert(self, doc):
    """Turn an Excel document into raw text.

    Saves ``doc`` via ``self.saveFile`` and converts the resulting file:
    with ``xlhtml`` followed by ``html2text`` when ``sys.platform`` is
    ``'win32'``, and with ``xls2csv`` (stderr discarded) otherwise.
    """
    sheet_path = self.saveFile(doc)
    on_windows = sys.platform == 'win32'

    if on_windows:
        html_dump = self.execute('xlhtml "%s"' % sheet_path)
        plain = html2text(html_dump, ignore_tags=('img',), indent_width=4, page_width=80)
    else:
        plain = self.execute('xls2csv -d UTF-8 -q 0 "%s" 2> /dev/null' % sheet_path)

    return plain