def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate table of contents
        '''

        with CurrentDir(output_dir):

            def build_node(current_node, parent=None):
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, ('ul'))
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(os.path.abspath(unquote(node.href)),
                                   os.path.dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent

            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap
 def get_comics_from_collection(self, stream):
     from ebook_converter.libunzip import extract as zipextract
     tdir = PersistentTemporaryDirectory('_comic_collection')
     zipextract(stream, tdir)
     comics = []
     with CurrentDir(tdir):
         if not os.path.exists('comics.txt'):
             raise ValueError(
                 ('%s is not a valid comic collection'
                  ' no comics.txt was found in the file') % stream.name)
         with open('comics.txt', 'rb') as f:
             raw = f.read()
         if raw.startswith(codecs.BOM_UTF16_BE):
             raw = raw.decode('utf-16-be')[1:]
         elif raw.startswith(codecs.BOM_UTF16_LE):
             raw = raw.decode('utf-16-le')[1:]
         elif raw.startswith(codecs.BOM_UTF8):
             raw = raw.decode('utf-8')[1:]
         else:
             raw = raw.decode('utf-8')
         for line in raw.splitlines():
             line = line.strip()
             if not line:
                 continue
             fname, title = line.partition(':')[0], line.partition(':')[-1]
             fname = fname.replace('#', '_')
             fname = os.path.join(tdir, *fname.split('/'))
             if not title:
                 title = os.path.basename(fname).rpartition('.')[0]
             if os.access(fname, os.R_OK):
                 comics.append([title, fname])
     if not comics:
         raise ValueError('%s has no comics' % stream.name)
     return comics
Example #3
0
def test():  # {{{
    # TODO(gryf): move this test to separate file.
    from ebook_converter.ptempfile import TemporaryDirectory
    from ebook_converter import CurrentDir
    from glob import glob
    # TODO(gryf): make the sample image out of pillow or smth
    # img = image_from_data(I('lt.png', data=True, allow_user_override=False))
    with TemporaryDirectory() as tdir, CurrentDir(tdir):
        save_image(img, 'test.jpg')
        ret = optimize_jpeg('test.jpg')
        if ret is not None:
            raise SystemExit('optimize_jpeg failed: %s' % ret)
        ret = encode_jpeg('test.jpg')
        if ret is not None:
            raise SystemExit('encode_jpeg failed: %s' % ret)
        # TODO(gryf): make the sample image out of pillow or smth. for sure
        # tempfile would be better idea.
        #shutil.copyfile(I('lt.png'), 'test.png')
        ret = optimize_png('test.png')
        if ret is not None:
            raise SystemExit('optimize_png failed: %s' % ret)
        if glob('*.bak'):
            raise SystemExit('Spurious .bak files left behind')
    quantize_image(img)
    oil_paint_image(img)
    gaussian_sharpen_image(img)
    gaussian_blur_image(img)
    despeckle_image(img)
    remove_borders_from_image(img)
    image_to_data(img, fmt='GIF')
    raw = subprocess.Popen([get_exe_path('JxrDecApp'), '-h'], creationflags=0, stdout=subprocess.PIPE).stdout.read()
    if b'JPEG XR Decoder Utility' not in raw:
        raise SystemExit('Failed to run JxrDecApp')
Example #4
0
    def extract_content(self, output_dir):
        from ebook_converter.ebooks.pml.pmlconverter import pml_to_html

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)

        title = self.mi.title
        if not isinstance(title, str):
            title = title.decode('utf-8', 'replace')

        html = '<html><head><title>%s</title></head><body>%s</body></html>' % \
            (title, pml_to_html(pml))

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(self.header_record.non_text_offset,
                           len(self.sections)):
                name, img = self.get_image(i)
                if name:
                    images.append(name)
                    with open(name, 'wb') as imgf:
                        self.log.debug('Writing image %s to images/' % name)
                        imgf.write(img)

        opf_path = self.create_opf(output_dir, images)

        return opf_path
Example #5
0
def do_explode(path, dest):
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())
            try:
                os.remove('debug-raw.html')
            except Exception:
                pass

    return opf
Example #6
0
    def dump_images(self, output_dir):
        '''
        This is primarily used for debugging and 3rd party tools to
        get the images in the file.
        '''
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with CurrentDir(output_dir):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                with open(name, 'wb') as imgf:
                    imgf.write(img)
Example #7
0
    def create_opf(self, output_dir, images):
        with CurrentDir(output_dir):
            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images/', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            with open('metadata.opf', 'wb') as opffile:
                opf.render(opffile)

        return os.path.join(output_dir, 'metadata.opf')
    def encrypt_fonts(self, uris, tdir, _uuid):  # {{{
        from ebook_converter.polyglot.binary import from_hex_bytes

        key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid'% _uuid)
        key = bytearray(from_hex_bytes((key + key)[:32]))
        paths = []
        with CurrentDir(tdir):
            paths = [os.path.join(*x.split('/')) for x in uris]
            uris = dict(zip(uris, paths))
            fonts = []
            for uri in list(uris.keys()):
                path = uris[uri]
                if not os.path.exists(path):
                    uris.pop(uri)
                    continue
                self.log.debug('Encrypting font:', uri)
                with open(path, 'r+b') as f:
                    data = f.read(1024)
                    if len(data) >= 1024:
                        data = bytearray(data)
                        f.seek(0)
                        f.write(bytes(bytearray(data[i] ^ key[i%16]
                                                for i in range(1024))))
                    else:
                        self.log.warn('Font', path, 'is invalid, ignoring')
                if not isinstance(uri, str):
                    uri = uri.decode('utf-8')
                fonts.append('''
                <enc:EncryptedData>
                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                    <enc:CipherData>
                    <enc:CipherReference URI="%s"/>
                    </enc:CipherData>
                </enc:EncryptedData>
                '''%(uri.replace('"', '\\"')))
            if fonts:
                ans = '''<encryption
                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                    '''
                ans += '\n'.join(fonts)
                ans += '\n</encryption>'
                return ans
Example #9
0
    def __call__(self, stream, options, file_ext, log, accelerators,
                 output_dir):
        try:
            log('InputFormatPlugin: %s running' % self.name)
            if hasattr(stream, 'name'):
                log('on', stream.name)
        except:
            # In case stdout is broken
            pass

        with CurrentDir(output_dir):
            for x in os.listdir('.'):
                shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)

            ret = self.convert(stream, options, file_ext, log, accelerators)

        return ret
Example #10
0
    def __call__(self, stream, odir, log):
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.metadata.odt import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator

        if not os.path.exists(odir):
            os.makedirs(odir)
        with CurrentDir(odir):
            log('Extracting ODT file...')
            stream.seek(0)
            mi = get_metadata(stream, 'odt')
            if not mi.title:
                mi.title = 'Unknown'
            if not mi.authors:
                mi.authors = ['Unknown']
            self.filter_load(stream, mi, log)

            # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
            # which expects, that all lines are strings.
            html = ''.join([str(l) for l in self.lines])

            # A blanket img specification like this causes problems
            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            # odf2xhtml creates empty title tag
            html = html.replace('<title></title>',
                                '<title>%s</title>' % (mi.title, ))
            try:
                html = self.fix_markup(html, log)
            except:
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(as_bytes(html))
            zf = ZipFile(stream, 'r')
            self.extract_pictures(zf)
            opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
            opf.create_manifest([(os.path.abspath(f2), None)
                                 for f2 in walk(os.getcwd())])
            opf.create_spine([os.path.abspath('index.xhtml')])
            with open('metadata.opf', 'wb') as f:
                opf.render(f)
            return os.path.abspath('metadata.opf')
Example #11
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):

        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is not None:
                    if key == OPF_MIME:
                        try:
                            self.workaround_nook_cover_bug(root)
                        except:
                            self.log.exception('Something went wrong while trying to'
                                    ' workaround Nook cover bug, ignoring')
                        try:
                            self.workaround_pocketbook_cover_bug(root)
                        except:
                            self.log.exception('Something went wrong while trying to'
                                    ' workaround Pocketbook cover bug, ignoring')
                        self.migrate_lang_code(root)
                    raw = etree.tostring(root, pretty_print=True,
                            encoding='utf-8', xml_declaration=True)
                    if key == OPF_MIME:
                        # Needed as I can't get lxml to output opf:role and
                        # not output <opf:metadata> as well
                        raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                    with open(href, 'wb') as f:
                        f.write(raw)

            for item in oeb_book.manifest:
                if (
                        not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
                            item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
                    condense_sheet(item.data)
                path = os.path.abspath(unquote(item.href))
                dir = os.path.dirname(path)
                if not os.path.exists(dir):
                    os.makedirs(dir)
                with open(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)
Example #12
0
    def create_opf(self, output_dir, images, toc):
        with CurrentDir(output_dir):
            if 'cover.png' in images:
                self.mi.cover = os.path.join('images', 'cover.png')

            opf = OPFCreator(output_dir, self.mi)

            manifest = [('index.html', None)]

            for i in images:
                manifest.append((os.path.join('images', i), None))

            opf.create_manifest(manifest)
            opf.create_spine(['index.html'])
            opf.set_toc(toc)
            with open('metadata.opf', 'wb') as opffile:
                with open('toc.ncx', 'wb') as tocfile:
                    opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(output_dir, 'metadata.opf')
Example #13
0
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir
    '''

    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        cmd = [
            'pdftohtml', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
            '-nodrm',
            a(pdfsrc),
            a(index)
        ]

        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd,
                      stderr=logf._fd,
                      stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError('Could not find pdftohtml, check it is '
                                      'in your PATH')
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        out = open(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: '
                                  '%d\n%s' % (ret, out))
        if out:
            print("pdftohtml log:")
            print(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head', '<!-- created by ebook-converter\'s'
                    ' pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                raw = re.sub(r'<a\s+name=(\d+)',
                             r'<a id="\1"',
                             raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"',
                             r'<a href="#p\1"',
                             raw,
                             flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')

                i.write(raw.encode('utf-8'))

            cmd = [
                'pdftohtml', '-f', '1', '-l', '1', '-xml', '-i', '-enc',
                'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q',
                '-stdout',
                a(pdfsrc)
            ]
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

        try:
            os.remove(pdfsrc)
        except Exception:
            pass
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from lxml import etree
        from ebook_converter.utils import zipfile
        from templite import Templite
        from ebook_converter.polyglot.urllib import unquote
        from ebook_converter.ebooks.html.meta import EasyMeta

        # read template files
        if opts.template_html_index is not None:
            with open(opts.template_html_index, 'rb') as f:
                template_html_index_data = f.read()
        else:
            with open(
                    pkg_resources.resource_filename(
                        'ebook_converter',
                        'data/html_export_default_index.tmpl')) as fobj:
                template_html_index_data = fobj.read().decode()

        if opts.template_html is not None:
            with open(opts.template_html, 'rb') as f:
                template_html_data = f.read()
        else:
            with open(
                    pkg_resources.resource_filename(
                        'ebook_converter',
                        'data/html_export_default.tmpl')) as fobj:
                template_html_data = fobj.read().decode()

        if opts.template_css is not None:
            with open(opts.template_css, 'rb') as f:
                template_css_data = f.read()
        else:
            with open(
                    pkg_resources.resource_filename(
                        'ebook_converter',
                        'data/html_export_default.css')) as fobj:
                template_css_data = fobj.read().decode()

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(
            tempdir,
            os.path.basename(re.sub(r'\.zip', '', output_path) + '.html'))
        output_dir = re.sub(r'\.html', '', output_file) + '_files'

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file,
                                              output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir + os.sep + nextLink,
                               os.path.dirname(output_file))
            cssLink = relpath(os.path.abspath(css_path),
                              os.path.dirname(output_file))
            tocUrl = relpath(output_file, os.path.dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                toc=html_toc,
                                meta=meta,
                                nextLink=nextLink,
                                tocUrl=tocUrl,
                                cssLink=cssLink,
                                firstContentPageLink=nextLink)
            if isinstance(t, str):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = os.path.abspath(unquote(item.href))
                dir = os.path.dirname(path)
                if not os.path.exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(item.bytes_representation)
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = os.path.abspath(unquote(item.href))
                dir = os.path.dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath(
                    '//h:head',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head,
                                              pretty_print=True,
                                              encoding='unicode')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(
                    re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                    head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>',
                                      head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath(
                    '//h:body',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body,
                                               pretty_print=True,
                                               encoding='unicode')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                       r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position + 1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position + 1].href
                    nextLink = relpath(os.path.abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position - 1].href
                    prevLink = relpath(os.path.abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(os.path.abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir
                                                     )
                t = templite.render(ebookContent=ebook_content,
                                    prevLink=prevLink,
                                    nextLink=nextLink,
                                    has_toc=bool(oeb_book.toc.count()),
                                    toc=toc,
                                    tocUrl=tocUrl,
                                    head_content=head_content,
                                    meta=meta,
                                    cssLink=cssLink,
                                    firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t.encode('utf-8'))
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        zfile.add_dir(output_dir, os.path.basename(output_dir))
        zfile.write(output_file, os.path.basename(output_file),
                    zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)

        zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
Example #15
0
    def extract_content(self, output_dir):
        from ebook_converter.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
        from ebook_converter.ebooks.pml.pmlconverter import PML_HTMLizer

        output_dir = os.path.abspath(output_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        title = self.mi.title
        if not isinstance(title, str):
            title = title.decode('utf-8', 'replace')
        html = '<html><head><title>%s</title></head><body>' % title

        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)
        hizer = PML_HTMLizer()
        html += hizer.parse_pml(pml, 'index.html')
        toc = hizer.get_toc()

        if self.header_record.footnote_count > 0:
            html += '<br /><h1>%s</h1>' % 'Footnotes'
            footnoteids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.footnote_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for fid, i in enumerate(range(self.header_record.footnote_offset + 1, self.header_record.footnote_offset + self.header_record.footnote_count)):
                self.log.debug('Extracting footnote page %i' % i)
                if fid < len(footnoteids):
                    fid = footnoteids[fid]
                else:
                    fid = ''
                html += footnote_to_html(fid, self.decompress_text(i))

        if self.header_record.sidebar_count > 0:
            html += '<br /><h1>%s</h1>' % 'Sidebar'
            sidebarids = re.findall(
                '\\w+(?=\x00)', self.section_data(self.header_record.sidebar_offset).decode('cp1252' if self.encoding is None else self.encoding))
            for sid, i in enumerate(range(self.header_record.sidebar_offset + 1, self.header_record.sidebar_offset + self.header_record.sidebar_count)):
                self.log.debug('Extracting sidebar page %i' % i)
                if sid < len(sidebarids):
                    sid = sidebarids[sid]
                else:
                    sid = ''
                html += sidebar_to_html(sid, self.decompress_text(i))

        html += '</body></html>'

        with CurrentDir(output_dir):
            with open('index.html', 'wb') as index:
                self.log.debug('Writing text to index.html')
                index.write(html.encode('utf-8'))

        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        images = []
        with CurrentDir(os.path.join(output_dir, 'images/')):
            for i in range(0, self.header_record.num_image_pages):
                name, img = self.get_image(self.header_record.image_data_offset + i)
                images.append(name)
                with open(name, 'wb') as imgf:
                    self.log.debug('Writing image %s to images/' % name)
                    imgf.write(img)

        opf_path = self.create_opf(output_dir, images, toc)

        return opf_path
Example #16
0
    def extract_content(self, output_dir):
        # Each text record is independent (unless the continuation
        # value is set in the previous record). Put each converted
        # text recored into a separate file. We will reference the
        # home.html file as the first file and let the HTML input
        # plugin assemble the order based on hyperlinks.
        with CurrentDir(output_dir):
            for uid, num in self.uid_text_secion_number.items():
                self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
                with open('%s.html' % uid, 'wb') as htmlf:
                    html = u'<html><body>'
                    section_header, section_data = self.sections[num]
                    if section_header.type == DATATYPE_PHTML:
                        html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                    elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                        d = self.decompress_phtml(section_data.data)
                        html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(self.get_text_uid_encoding(section_header.uid), 'replace')
                    html += '</body></html>'
                    htmlf.write(html.encode('utf-8'))

        # Images.
        # Cache the image sizes in case they are used by a composite image.
        images = set()
        if not os.path.exists(os.path.join(output_dir, 'images/')):
            os.makedirs(os.path.join(output_dir, 'images/'))
        with CurrentDir(os.path.join(output_dir, 'images/')):
            # Single images.
            for uid, num in self.uid_image_section_number.items():
                section_header, section_data = self.sections[num]
                if section_data:
                    idata = None
                    if section_header.type == DATATYPE_TBMP:
                        idata = section_data
                    elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                        if self.header_record.compression == 1:
                            idata = decompress_doc(section_data)
                        elif self.header_record.compression == 2:
                            idata = zlib.decompress(section_data)
                    try:
                        save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                        images.add(uid)
                        self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                    except Exception as e:
                        self.log.error('Failed to write image with uid %s: %s' % (uid, e))
                else:
                    self.log.error('Failed to write image with uid %s: No data.' % uid)
            # Composite images.
            # We're going to use the already compressed .jpg images here.
            for uid, num in self.uid_composite_image_section_number.items():
                try:
                    section_header, section_data = self.sections[num]
                    # Get the final width and height.
                    width = 0
                    height = 0
                    for row in section_data.layout:
                        row_width = 0
                        col_height = 0
                        for col in row:
                            if col not in images:
                                raise Exception('Image with uid: %s missing.' % col)
                            w, h = identify(open('%s.jpg' % col, 'rb'))[1:]
                            row_width += w
                            if col_height < h:
                                col_height = h
                        if width < row_width:
                            width = row_width
                        height += col_height
                    # Create a new image the total size of all image
                    # parts. Put the parts into the new image.
                    with Canvas(width, height) as canvas:
                        y_off = 0
                        for row in section_data.layout:
                            x_off = 0
                            largest_height = 0
                            for col in row:
                                im = image_from_data(open('%s.jpg' % col, 'rb').read())
                                canvas.compose(im, x_off, y_off)
                                w, h = im.width(), im.height()
                                x_off += w
                                if largest_height < h:
                                    largest_height = h
                            y_off += largest_height
                    with open('%s.jpg' % uid) as out:
                        out.write(canvas.export(compression_quality=70))
                    self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

        # Run the HTML through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(self.options, opt.option.name, opt.recommended_value)
        self.options.input_encoding = 'utf-8'
        odi = self.options.debug_pipeline
        self.options.debug_pipeline = None
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata recored. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                home_html = self.uid_text_secion_number.items()[0][0]
        except:
            raise Exception('Could not determine home.html')
        # Generate oeb from html conversion.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
        self.options.debug_pipeline = odi

        return oeb