Example #1
0
def pdf_convert2(self, doc, encoding, mimetype):
    """Extract the text of a PDF document with an external converter.

    ``doc`` is handed to ``self.saveFile`` and the name it returns is
    substituted into the converter command line.  On Windows ``pdftohtml``
    is run and its output is flattened with ``html2text``; on every other
    platform ``pdftotext`` is run directly.  ``mimetype`` is accepted but
    not used.

    Returns a ``(text, encoding)`` tuple whose second item is the
    ``encoding`` argument when it is truthy and ``'utf-8'`` otherwise.
    """
    pdf_path = self.saveFile(doc)

    # NOTE(review): the caller's encoding is echoed back unchanged even
    # though pdftotext is asked for UTF-8 output -- confirm this is intended.
    if sys.platform != 'win32':
        text = self.execute('pdftotext -enc UTF-8 "%s" -' % pdf_path)
        return text, encoding or 'utf-8'

    # Windows: go through HTML and strip the markup afterwards.
    markup = self.execute('pdftohtml -stdout -i -noframes "%s"' % pdf_path)
    text = html2text(markup,
                     ignore_tags=('img',),
                     indent_width=4,
                     page_width=80)
    return text, encoding or 'utf-8'
Example #2
0
def pdf_convert2(self, doc, encoding, mimetype):
    """Turn PDF data into plain text by shelling out to a converter.

    The document is first passed to ``self.saveFile``; the returned name is
    inserted into the command run through ``self.execute``.  ``pdftohtml``
    plus ``html2text`` is used when ``sys.platform`` is ``'win32'``,
    ``pdftotext`` everywhere else.  The ``mimetype`` argument is ignored.

    The result is a two-item tuple: the converted text, followed by
    ``encoding`` if it is truthy or the string ``'utf-8'`` if it is not.
    """
    source = self.saveFile(doc)
    # Fall back to UTF-8 when the caller supplied no (or an empty) encoding.
    label = encoding if encoding else 'utf-8'

    if sys.platform == 'win32':
        page = self.execute('pdftohtml -stdout -i -noframes "%s"' % source)
        converted = html2text(page,
                              ignore_tags=('img', ),
                              indent_width=4,
                              page_width=80)
    else:
        converted = self.execute('pdftotext -enc UTF-8 "%s" -' % source)

    return converted, label
Example #3
0
def xls_convert(self, doc):
    """Extract the text of an Excel document with an external converter.

    ``doc`` is handed to ``self.saveFile`` and the name it returns is
    substituted into the converter command line.  On Windows ``xlhtml`` is
    run and its output is flattened with ``html2text``; on every other
    platform the result of running ``xls2csv`` (stderr discarded by the
    shell redirection) is returned as is.
    """
    sheet_path = self.saveFile(doc)

    if sys.platform != 'win32':
        return self.execute(
            'xls2csv -d UTF-8 -q 0 "%s" 2> /dev/null' % sheet_path)

    # Windows: go through HTML and strip the markup afterwards.
    markup = self.execute('xlhtml "%s"' % sheet_path)
    return html2text(markup,
                     ignore_tags=('img',),
                     indent_width=4,
                     page_width=80)
Example #4
0
def xls_convert(self, doc):
    """Turn Excel data into plain text by shelling out to a converter.

    The document is first passed to ``self.saveFile``; the returned name is
    inserted into the command run through ``self.execute``.  ``xlhtml``
    followed by ``html2text`` is used when ``sys.platform`` is ``'win32'``;
    everywhere else ``xls2csv`` is used and whatever ``self.execute``
    returns for it is handed back unchanged.
    """
    workbook = self.saveFile(doc)

    if sys.platform == 'win32':
        command = 'xlhtml "%s"' % workbook
        page = self.execute(command)
        result = html2text(page,
                           ignore_tags=('img', ),
                           indent_width=4,
                           page_width=80)
    else:
        command = 'xls2csv -d UTF-8 -q 0 "%s" 2> /dev/null' % workbook
        result = self.execute(command)

    return result