Example #1
0
def Book(options,
         logger,
         font_delta=0,
         header=None,
         profile=PRS500_PROFILE,
         **settings):
    """Create a ``_Book`` whose default styles come from *options* and *profile*.

    The page style is derived from the margins in *options* and the screen
    dimensions in *profile*. When *header* is truthy, a ``Header`` holding a
    single ``TextBlock`` with that content is attached to the page style, the
    top margin is reset to 0 and the text height is recomputed to leave room
    for the header height and separation.

    Custom fonts returned by ``find_custom_fonts(options, logger)`` are
    embedded in the book and recorded in ``FONT_FILE_MAP``. Families among
    ``serif``/``sans``/``mono`` that have no custom font fall back to
    ``profile.default_fonts``.

    Returns a ``(book, fonts)`` tuple.

    Raises ``ConversionError`` if a custom family is present but has no
    ``'normal'`` variant.
    """
    from uuid import uuid4

    text_width = profile.screen_width - (options.left_margin +
                                         options.right_margin)
    # Keys are inserted in the same order as before so that any consumer
    # iterating the page style sees an unchanged sequence.
    page_style = {
        'topmargin': options.top_margin,
        'evensidemargin': options.left_margin,
        'oddsidemargin': options.left_margin,
        'textwidth': text_width,
        'textheight': profile.screen_height
                      - (options.top_margin + options.bottom_margin)
                      - profile.fudge,
    }

    if header:
        header_obj = Header()
        header_style = TextStyle(align='foot',
                                 fontsize=int(profile.header_font_size * 10))
        header_block = TextBlock(
            textStyle=header_style,
            blockStyle=BlockStyle(blockwidth=page_style['textwidth']))
        header_block.append(header)
        header_obj.PutObj(header_block)

        page_style['headheight'] = profile.header_height
        page_style['headsep'] = options.header_separation
        page_style['header'] = header_obj
        # With a header present the top margin is dropped and the text area
        # shrinks by the header height and separation instead.
        page_style['topmargin'] = 0
        page_style['textheight'] = (
            profile.screen_height
            - (options.bottom_margin + page_style['topmargin'])
            - page_style['headheight'] - page_style['headsep']
            - profile.fudge)

    fontsize = int(10 * profile.font_size + font_delta * 20)
    fonts = find_custom_fonts(options, logger)
    text_style = {
        'fontsize': fontsize,
        'parindent': int(10 * profile.parindent),
        'linespace': int(10 * profile.line_space),
        'baselineskip': fontsize + 20,
        'wordspace': 10 * options.wordspace,
    }
    serif_faces = fonts['serif']
    if serif_faces and 'normal' in serif_faces:
        text_style['fontfacename'] = serif_faces['normal'][1]

    book = _Book(textstyledefault=text_style,
                 pagestyledefault=page_style,
                 blockstyledefault={'blockwidth': page_style['textwidth']},
                 bookid=uuid4().hex,
                 **settings)

    # Embed every custom font that was found and remember which file backs
    # each face name.
    for faces in fonts.values():
        if not faces:
            continue
        for font in faces.values():
            book.embed_font(*font)
            FONT_FILE_MAP[font[1]] = font[0]

    # Fill in profile defaults for missing families; a custom family without
    # a 'normal' variant is an error.
    for family in ('serif', 'sans', 'mono'):
        faces = fonts[family]
        if not faces:
            fonts[family] = {'normal': (None, profile.default_fonts[family])}
            continue
        if 'normal' not in faces:
            raise ConversionError('Could not find the normal version of the ' +
                                  family + ' font')
    return book, fonts
Example #2
0
def process_file(lrfpath, opts, logger=None):
    """Convert the LRF file at *lrfpath* using ``LRFConverter``.

    :param lrfpath: path of the LRF file; it is opened in binary mode.
    :param opts: options object. ``opts.verbose`` selects the log level when
        no logger is supplied. ``opts.out`` is the output directory and is
        rewritten in place to an absolute path (or to the current working
        directory when it is ``None``).
    :param logger: logger to use; when ``None`` the ``'lrf2html'`` logger is
        fetched and configured with ``setup_cli_handlers``.
    :raises ConversionError: if ``opts.out`` exists but is not a directory.
    """
    if logger is None:
        level = logging.DEBUG if opts.verbose else logging.INFO
        logger = logging.getLogger('lrf2html')
        setup_cli_handlers(logger, level)

    if opts.out is None:
        # os.getcwdu() only exists on Python 2; fall back to os.getcwd()
        # (which already returns text) everywhere else.
        opts.out = getattr(os, 'getcwdu', os.getcwd)()
    else:
        opts.out = os.path.abspath(opts.out)
        # Only reject paths that exist and are something other than a
        # directory. A missing path is created below; previously it was
        # rejected here, which made the makedirs() call unreachable.
        if os.path.exists(opts.out) and not os.path.isdir(opts.out):
            raise ConversionError(opts.out + ' is not a directory')
    if not os.path.isdir(opts.out):
        os.makedirs(opts.out)

    # Neither the document nor the converter is kept after this call, so the
    # input stream can be closed as soon as the converter returns.
    with open(lrfpath, 'rb') as stream:
        document = LRFDocument(stream)
        LRFConverter(document, opts, logger)
Example #3
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: directory that receives a temporary copy of the PDF
        (``src.pdf``) and the generated ``index.html`` / ``index.xml``.
    :param pdf_path: path of the PDF to convert.
    :param no_images: when true, ``-i`` is passed to pdftohtml.
    :param as_xml: when true, ``-xml`` is passed, the output is written to
        ``index.xml`` and the HTML post-processing and outline parsing
        are skipped.
    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code.
    :raises DRMError: if the index file is missing or smaller than 100 bytes.
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        # pdftohtml is run from inside output_dir, so only base names are
        # passed on the command line.
        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [
            exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        # Both stdout and stderr of the child are captured in one log file.
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is
        # closed even if decoding fails.
        with lopen(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' %
                                  (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head',
                    '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                # Turn numeric <a name=N> anchors into <a id="pN"> and point
                # the matching index.html#N links at them.
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            # Second run: XML for the first page only, written to stdout and
            # handed to parse_outline.
            cmd = [
                exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                a(pdfsrc)
            ]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary PDF copy; only filesystem
        # errors are ignored.
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
Example #4
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: directory that receives a temporary copy of the PDF
        (``src.pdf``) and the generated ``index.html`` / ``index.xml``.
    :param pdf_path: path of the PDF to convert.
    :param no_images: when true, ``-i`` is passed to pdftohtml.
    :param as_xml: when true, ``-xml`` is passed, the output is written to
        ``index.xml`` and the HTML post-processing and outline parsing
        are skipped.
    :raises ConversionError: if pdftohtml cannot be found or exits with a
        non-zero return code.
    :raises DRMError: if the index file is missing or smaller than 100 bytes.
    '''

    pdfsrc = os.path.join(output_dir, u'src.pdf')
    index = os.path.join(output_dir, u'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are
        # bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(
            PDFTOHTML, unicode) else PDFTOHTML

        cmd = [
            exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
            b'-nodrm', b'-q',
            a(pdfsrc),
            a(index)
        ]

        if isbsd:
            cmd.remove(b'-nodrm')
        if no_images:
            cmd.append(b'-i')
        if as_xml:
            cmd.append(b'-xml')

        # Both stdout and stderr of the child are captured in one log file.
        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

        # Retry wait() when it is interrupted by a signal.
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the log back through a context manager so the handle is
        # always closed.
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().strip()
        if ret != 0:
            raise ConversionError(b'return code: %d\n%s' % (ret, out))
        if out:
            # Single-argument parenthesised form prints identically whether
            # print is a statement or a function.
            print("pdftohtml log:")
            print(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read()
                raw = flip_images(raw)
                raw = b'<!-- created by calibre\'s pdftohtml -->\n' + raw
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace(b'<br/>', b'<br>')
                raw = re.sub(br'<a\s+name=(\d+)',
                             br'<a id="\1"',
                             raw,
                             flags=re.I)
                i.write(raw)

            # Second run: XML for the first page only, written to stdout and
            # handed to parse_outline.
            cmd = [
                exe, b'-f', b'1', b'-l', b'1', b'-xml', b'-i', b'-enc',
                b'UTF-8', b'-noframes', b'-p', b'-nomerge', b'-nodrm', b'-q',
                b'-stdout',
                a(pdfsrc)
            ]
            # -nodrm must be dropped BEFORE the command is launched;
            # removing it afterwards has no effect on the run.
            if isbsd:
                cmd.remove(b'-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        # Best-effort cleanup of the temporary PDF copy; only filesystem
        # errors are ignored.
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
Example #5
0
def XPath(x):
    """Compile *x* into an ``etree.XPath`` bound to the ``XPNSMAP`` namespaces.

    Raises ``ConversionError`` when ``etree`` reports a syntax error in the
    expression.
    """
    try:
        compiled = etree.XPath(x, namespaces=XPNSMAP)
    except etree.XPathSyntaxError:
        message = 'The syntax of the XPath expression %s is invalid.' % repr(x)
        raise ConversionError(message)
    return compiled
Example #6
0
def call_convert_cmd(log, output_dir, pdf_name, first=None, last=None):
    '''
    Convert the pdf into xml/txt using the pdftohtml/text app.
    This will write the output as index.xml/.txt into output_dir.

    pdftotext is often better than pdftohtml.

    :param log: callable used to report the converter's log output.
    :param output_dir: directory containing the PDF; the index file is
        written here and any existing one is removed first.
    :param pdf_name: file name of the PDF inside output_dir.
    :param first: first page to convert (passed as ``-f``), or None.
    :param last: last page to convert (passed as ``-l``), or None.
    :return: the extracted text. With pdftohtml this is the concatenated
        text of all ``text`` elements in the generated XML.
    :raises ConversionError: if the converter cannot be found or exits with
        a non-zero return code.
    :raises DRMError: if no index file was produced.
    '''
    from calibre.ebooks.pdf.pdftohtml import popen

    pdfsrc = os.path.join(output_dir, pdf_name)
    if USE_PDFTOTEXT:
        EXE = 'pdftotext'
        index_file = os.path.join(output_dir, 'index.txt')
    else:
        from calibre.ebooks.pdf.pdftohtml import PDFTOHTML as EXE
        index_file = os.path.join(output_dir, 'index.xml')

    if os.path.exists(index_file):
        os.remove(index_file)

    with CurrentDir(output_dir):
        # This is necessary as pdftohtml doesn't always (linux) respect
        # absolute paths. Also, it allows us to safely pass only bytestring
        # arguments to subprocess on windows

        # subprocess in python 2 cannot handle unicode arguments on windows
        # that cannot be encoded with mbcs. Ensure all args are bytestrings.
        def a(x):
            return os.path.basename(x).encode('ascii')

        exe = EXE.encode(filesystem_encoding) if isinstance(EXE, str) else EXE
        if USE_PDFTOTEXT:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-nopgbrk', b'-q',
                a(pdfsrc),
                a(index_file)
            ]
        else:
            cmd = [
                exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
                b'-nodrm', b'-q',
                a(pdfsrc),
                a(index_file), b'-xml', b'-i'
            ]
            if isbsd:
                cmd.remove(b'-nodrm')

        # Page numbers are encoded so that every argv entry is a bytestring,
        # like the rest of the command line.
        if first is not None:
            cmd.append(b'-f')
            cmd.append(str(first).encode('ascii'))
        if last is not None:
            cmd.append(b'-l')
            cmd.append(str(last).encode('ascii'))

        # Both stdout and stderr of the child are captured in one log file.
        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find %s, check it is in your PATH') % EXE)
            else:
                raise

        # Retry wait() when it is interrupted by a signal.
        while True:
            try:
                ret = p.wait()
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    continue
                else:
                    raise
        logf.flush()
        logf.close()
        # Read the log through a context manager so the handle is closed, and
        # decode it so error messages and log lines are text, not bytes.
        with open(logf.name, 'rb') as log_stream:
            out = log_stream.read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError(out)
        if out:
            log('%s log:' % EXE)
            log(out)
        if not os.path.exists(index_file):
            raise DRMError()

        # Undecodable bytes in the converter output are dropped on read.
        with open(index_file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        if not USE_PDFTOTEXT:
            # avoid encoding problems: hand the parser UTF-8 bytes
            content = text.encode('utf-8')
            parser = etree.XMLParser(recover=True)
            tree = etree.fromstring(clean_ascii_chars(content), parser)
            if tree is None:
                # NOTE(review): a recovering parser may yield no root for
                # empty or unusable input; treat that as "no text".
                text = ''
            else:
                text = ''.join(e.text or '' for e in tree.iter('text'))
        return text