Ejemplo n.º 1
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.

    The PDF is first copied to output_dir as src.pdf and pdftohtml is run
    from inside output_dir on that copy. The result is written to
    output_dir as index.html, or as index.xml when as_xml is true.
    pdftohtml also writes all extracted images to output_dir, unless
    no_images is true, in which case it is run with -i.

    For HTML output the generated index.html is post-processed in place and
    the output of a second, first-page-only XML run is handed to
    parse_outline().

    :param output_dir: Directory that receives the converted file
    :param pdf_path: Path to the PDF to convert
    :param no_images: If true, pass -i to pdftohtml
    :param as_xml: If true, pass -xml and skip the HTML post-processing

    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code
    :raises DRMError: if no index file was produced or it is smaller than
        100 bytes
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    # Run inside output_dir so that pdftohtml can be given bare file names
    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [
            exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        # Both stdout and stderr of pdftohtml are collected in one log file
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is closed
        with lopen(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' %
                                  (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        # A missing or nearly empty index file is reported as DRMError
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head',
                    '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric <a name=N> anchors into <a id="pN"> and point
                # links to index.html#N at the renamed in-document anchors
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = replace_entities(raw)
                # Replace non-breaking spaces with plain spaces
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: first page only, XML written to stdout, whose
            # output is passed to parse_outline()
            cmd = [
                exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                a(pdfsrc)
            ]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary copy; only filesystem errors
        # are ignored
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
Ejemplo n.º 2
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.

    The PDF is first copied to output_dir as src.pdf and pdftohtml is run
    from inside output_dir on that copy. The result is written to
    output_dir as index.html, or as index.xml when as_xml is true.
    pdftohtml also writes all extracted images to output_dir, unless
    no_images is true, in which case it is run with -i.

    For HTML output the generated index.html is post-processed in place and
    the output of a second, first-page-only XML run is handed to
    parse_outline().

    :param output_dir: Directory that receives the converted file
    :param pdf_path: Path to the PDF to convert
    :param no_images: If true, pass -i to pdftohtml
    :param as_xml: If true, pass -xml and skip the HTML post-processing

    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code
    :raises DRMError: if no index file was produced or it is smaller than
        100 bytes
    '''

    pdfsrc = os.path.join(output_dir, u'src.pdf')
    index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are
        # bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
                unicode) else PDFTOHTML

        cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                b'-nodrm', b'-q', a(pdfsrc), a(index)]

        if isbsd:
            cmd.remove(b'-nodrm')
        if no_images:
            cmd.append(b'-i')
        if as_xml:
            cmd.append(b'-xml')

        # Both stdout and stderr of pdftohtml are collected in one log file
        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                    stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

        # Retry the wait if it is interrupted by a signal (EINTR)
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is closed
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().strip()
        if ret != 0:
            raise ConversionError(b'return code: %d\n%s' % (ret, out))
        if out:
            # Single-argument print(...) behaves like the print statement
            print("pdftohtml log:")
            print(out)
        # A missing or nearly empty index file is reported as DRMError
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
                raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace(b'<br/>', b'<br>')
                raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
                i.write(raw)

            # Second run: first page only, XML written to stdout, whose
            # output is passed to parse_outline()
            cmd = [exe, b'-f', b'1', b'-l', b'1', b'-xml', b'-i', b'-enc',
                    b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                    b'-nodrm', b'-q', b'-stdout', a(pdfsrc)]
            # -nodrm must be dropped *before* the process is started; doing
            # it after popen() has no effect on the command that was run
            if isbsd:
                cmd.remove(b'-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary copy; only filesystem errors
        # are ignored
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
Ejemplo n.º 3
0
def call_convert_cmd(log, output_dir, pdf_name, first=None, last=None):
    '''
    Convert the pdf into xml/txt using the pdftohtml/text app.
    This will write the output as index.xml/.txt into output_dir.

    pdftotext is often better than pdftohtml.

    Which tool is used depends on the module-level USE_PDFTOTEXT flag. Any
    existing index file in output_dir is removed before the tool is run.

    :param log: Callable that receives the tool's log output, if any
    :param output_dir: Directory containing the PDF; the tool is run from
        inside it
    :param pdf_name: Name of the PDF file inside output_dir
    :param first: If not None, passed to the tool as the -f value
    :param last: If not None, passed to the tool as the -l value

    :return: The extracted text. For pdftohtml this is the concatenated
        text of all <text> elements of the generated XML.

    :raises ConversionError: if the tool cannot be found or exits with a
        non-zero return code
    :raises DRMError: if no index file was produced
    '''
    from calibre.ebooks.pdf.pdftohtml import popen

    pdfsrc = os.path.join(output_dir, pdf_name)
    if USE_PDFTOTEXT:
        EXE = 'pdftotext'
        index_file = os.path.join(output_dir, 'index.txt')
    else:
        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML as EXE
        index_file = os.path.join(output_dir, 'index.xml')

    if os.path.exists(index_file):
        os.remove(index_file)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = EXE.encode(filesystem_encoding) if isinstance(EXE, str) else EXE
        if USE_PDFTOTEXT:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-nopgbrk', b'-q',
                a(pdfsrc),
                a(index_file)
            ]
        else:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                b'-nodrm', b'-q',
                a(pdfsrc),
                a(index_file), b'-xml', b'-i'
            ]
            if isbsd:
                cmd.remove(b'-nodrm')

        # Keep the page-range values as bytestrings like every other argument
        if first is not None:
            cmd.append(b'-f')
            cmd.append(str(first).encode('ascii'))
        if last is not None:
            cmd.append(b'-l')
            cmd.append(str(last).encode('ascii'))

        # Both stdout and stderr of the tool are collected in one log file
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find %s, check it is in your PATH') % EXE)
            else:
                raise

        # Retry the wait if it is interrupted by a signal (EINTR)
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Close the log handle deterministically and decode the output so
        # the error and log messages are readable text rather than bytes
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError(out)
        if out:
            log('%s log:' % EXE)
            log(out)
        if not os.path.exists(index_file):
            raise DRMError()

        # Undecodable bytes in the tool's output are dropped while reading
        with open(index_file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()

        if not USE_PDFTOTEXT:
            # avoid encoding problems
            content = text.encode('utf-8')
            parser = etree.XMLParser(recover=True)
            tree = etree.fromstring(clean_ascii_chars(content), parser)
            text = ''.join(e.text or '' for e in tree.iter('text'))
        return text
Ejemplo n.º 4
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.

    The PDF is first copied to output_dir as src.pdf and pdftohtml is run
    from inside output_dir on that copy. The result is written to
    output_dir as index.html, or as index.xml when as_xml is true.
    pdftohtml also writes all extracted images to output_dir, unless
    no_images is true, in which case it is run with -i.

    For HTML output the generated index.html is post-processed in place and
    the output of a second, first-page-only XML run is handed to
    parse_outline().

    :param output_dir: Directory that receives the converted file
    :param pdf_path: Path to the PDF to convert
    :param no_images: If true, pass -i to pdftohtml
    :param as_xml: If true, pass -xml and skip the HTML post-processing

    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code
    :raises DRMError: if no index file was produced or it is smaller than
        100 bytes
    '''

    pdfsrc = os.path.join(output_dir, u'src.pdf')
    index = os.path.join(output_dir, u'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are
        # bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(
            PDFTOHTML, unicode) else PDFTOHTML

        cmd = [
            exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
            b'-nodrm', b'-q',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove(b'-nodrm')
        if no_images:
            cmd.append(b'-i')
        if as_xml:
            cmd.append(b'-xml')

        # Both stdout and stderr of pdftohtml are collected in one log file
        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

        # Retry the wait if it is interrupted by a signal (EINTR)
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is closed
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().strip()
        if ret != 0:
            raise ConversionError(b'return code: %d\n%s' % (ret, out))
        if out:
            # Single-argument print(...) behaves like the print statement
            print("pdftohtml log:")
            print(out)
        # A missing or nearly empty index file is reported as DRMError
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
                raw = '<!-- created by calibre\'s pdftohtml -->\n' + raw
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace(b'<br/>', b'<br>')
                raw = re.sub(br'<a\s+name=(\d+)',
                             br'<a id="\1"',
                             raw,
                             flags=re.I)
                i.write(raw)

            # Second run: first page only, XML written to stdout, whose
            # output is passed to parse_outline()
            cmd = [
                exe, b'-f', b'1', b'-l', b'1', b'-xml', b'-i', b'-enc',
                b'UTF-8', b'-noframes', b'-p', b'-nomerge', b'-nodrm', b'-q',
                b'-stdout',
                a(pdfsrc)
            ]
            # -nodrm must be dropped *before* the process is started; doing
            # it after popen() has no effect on the command that was run
            if isbsd:
                cmd.remove(b'-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary copy; only filesystem errors
        # are ignored
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
Ejemplo n.º 5
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.

    The PDF is first copied to output_dir as src.pdf and pdftohtml is run
    from inside output_dir on that copy. The result is written to
    output_dir as index.html, or as index.xml when as_xml is true.
    pdftohtml also writes all extracted images to output_dir, unless
    no_images is true, in which case it is run with -i.

    For HTML output the generated index.html is post-processed in place and
    the output of a second, first-page-only XML run is handed to
    parse_outline().

    :param output_dir: Directory that receives the converted file
    :param pdf_path: Path to the PDF to convert
    :param no_images: If true, pass -i to pdftohtml
    :param as_xml: If true, pass -xml and skip the HTML post-processing

    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code
    :raises DRMError: if no index file was produced or it is smaller than
        100 bytes
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    # Run inside output_dir so that pdftohtml can be given bare file names
    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        # Both stdout and stderr of pdftohtml are collected in one log file
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                    stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is closed
        with lopen(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        # A missing or nearly empty index file is reported as DRMError
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric <a name=N> anchors into <a id="pN"> and point
                # links to index.html#N at the renamed in-document anchors
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = replace_entities(raw)
                # Replace non-breaking spaces with plain spaces
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: first page only, XML written to stdout, whose
            # output is passed to parse_outline()
            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                    '-nodrm', '-q', '-stdout', a(pdfsrc)]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary copy; only filesystem errors
        # are ignored
        try:
            os.remove(pdfsrc)
        except OSError:
            pass