Ejemplo n.º 1
0
class DOCXInput(InputFormatPlugin):
    """Input plugin that hands DOCX/DOCM documents to the DOCX converter."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = 'Convert DOCX files (.docx and .docm) to HTML'
    file_types = {'docx', 'docm'}
    commit_name = 'docx_input'

    # Plugin-specific switches; each one is forwarded to ``Convert`` below.
    options = {
        OptionRecommendation(
            name='docx_no_cover',
            recommended_value=False,
            help=('Normally, if a large image is present at the start of the '
                  'document that looks like a cover, it will be removed from '
                  'the document and used as the cover for created e-book. This '
                  'option turns off that behavior.'),
        ),
        OptionRecommendation(
            name='docx_no_pagebreaks_between_notes',
            recommended_value=False,
            help='Do not insert a page break after every endnote.',
        ),
        OptionRecommendation(
            name='docx_inline_subsup',
            recommended_value=False,
            help=('Render superscripts and subscripts so that they do not '
                  'affect the line height.'),
        ),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        """Run the DOCX converter on *stream* and return its result.

        ``file_ext`` and ``accelerators`` are accepted for interface
        compatibility and are not used here.
        """
        from ebook_converter.ebooks.docx.to_html import Convert

        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
            nosupsub=options.docx_inline_subsup,
        )
        # The converter object is callable; calling it performs the work.
        return converter()
Ejemplo n.º 2
0
class PDBOutput(OutputFormatPlugin):
    """Output plugin that writes a book into a PDB container.

    The actual serialisation is delegated to the writer class that
    ``get_writer`` returns for the ``format`` option.
    """

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(
            name='format',
            recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f',
            choices=list(ALL_FORMAT_WRITERS),
            help='Format to use inside the pdb container. Choices are: %s' %
            sorted(ALL_FORMAT_WRITERS)),
        OptionRecommendation(
            name='pdb_output_encoding',
            recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
            'The default is cp1252. Note: This option is not honored by '
            'all formats.'),
        OptionRecommendation(
            name='inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.'),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write *oeb_book* to *output_path* using the selected PDB writer.

        :param oeb_book: book to serialise; its ``metadata`` attribute is
            passed to the writer as well.
        :param output_path: either a filesystem path or an object with a
            ``write`` method. A path is opened (and closed) here; a
            stream is used as-is and left open for the caller.
        :param input_plugin: unused.
        :param opts: conversion options; ``opts.format`` selects the
            writer. ``max_line_length`` and ``force_max_line_length`` are
            overwritten on this object.
        :param log: logger handed to the writer.
        :raises PDBError: if no writer is available for ``opts.format``.
        """
        # Resolve the writer before touching the filesystem so that an
        # unsupported format does not leave an empty output file behind.
        Writer = get_writer(opts.format)
        if Writer is None:
            raise PDBError(
                'No writer available for format %s.' % opts.format)

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            opts.max_line_length = 0
            opts.force_max_line_length = False

            writer = Writer(opts, log)

            # Discard anything already present in a caller-supplied stream.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close what this method opened, even when writing fails.
            if close:
                out_stream.close()
Ejemplo n.º 3
0
class RBOutput(OutputFormatPlugin):
    """Output plugin that serialises a book with ``RBWriter``."""

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = {
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.')}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write *oeb_book* to *output_path* in RB format.

        :param oeb_book: book to serialise; its ``metadata`` attribute is
            passed to the writer as well.
        :param output_path: either a filesystem path or an object with a
            ``write`` method. A path is opened (and closed) here; a
            stream is used as-is and left open for the caller.
        :param input_plugin: unused.
        :param opts: conversion options forwarded to ``RBWriter``.
        :param log: logger forwarded to ``RBWriter``.
        """
        from ebook_converter.ebooks.rb.writer import RBWriter

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            # Discard anything already present in a caller-supplied stream.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close what this method opened, even when writing fails.
            if close:
                out_stream.close()
Ejemplo n.º 4
0
class TCROutput(OutputFormatPlugin):
    """Output plugin that writes the book text as a compressed TCR file."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(
            name='tcr_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
            'The default is utf-8.')
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Extract, encode, compress and write the text of *oeb_book*.

        :param oeb_book: book whose content is extracted with
            ``TXTMLizer``.
        :param output_path: either a filesystem path or an object with a
            ``write`` method. A path is opened (and closed) here; a
            stream is used as-is and left open for the caller.
        :param input_plugin: unused.
        :param opts: conversion options. ``flush_paras``,
            ``max_line_length``, ``force_max_line_length`` and
            ``indent_paras`` are overwritten on this object;
            ``tcr_output_encoding`` selects the byte encoding.
        :param log: logger used for progress messages.
        """
        from ebook_converter.ebooks.txt.txtml import TXTMLizer
        from ebook_converter.ebooks.compression.tcr import compress

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            opts.flush_paras = False
            opts.max_line_length = 0
            opts.force_max_line_length = False
            opts.indent_paras = False

            writer = TXTMLizer(log)
            # Characters the target encoding cannot represent are replaced
            # rather than aborting the conversion.
            txt = writer.extract_content(oeb_book, opts).encode(
                opts.tcr_output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt)

            # Discard anything already present in a caller-supplied stream.
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close what this method opened, even when a step fails.
            if close:
                out_stream.close()
Ejemplo n.º 5
0
class ComicInput(InputFormatPlugin):
    """Input plugin for comic archives (.cbz, .cbr) and collections (.cbc).

    ``convert`` turns the page images into per-page XHTML wrappers (or a
    single wrapper when ``for_viewer`` is set) and writes ``metadata.opf``
    and ``toc.ncx`` into the current working directory.
    """

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
    file_types = {'cbz', 'cbr', 'cbc'}
    is_image_collection = True
    commit_name = 'comic_input'
    # NOTE(review): the meaning of -1 is defined by the plugin framework,
    # which is not visible here - confirm before changing.
    core_usage = -1

    # Plugin-specific options. Within this class only ``no_sort``,
    # ``no_process``, ``output_format``, ``verbose`` and
    # ``dont_add_comic_pages_to_toc`` are read directly; the whole options
    # object is also handed to ``process_pages``.
    options = {
        OptionRecommendation(
            name='colors',
            recommended_value=0,
            help='Reduce the number of colors used in the image. This works '
            'only if you choose the PNG output format. It is useful to '
            'reduce file sizes. Set to zero to turn off. Maximum value '
            'is 256. It is off by default.'),
        OptionRecommendation(
            name='dont_normalize',
            recommended_value=False,
            help='Disable normalize (improve contrast) color range '
            'for pictures. Default: False'),
        OptionRecommendation(
            name='keep_aspect_ratio',
            recommended_value=False,
            help='Maintain picture aspect ratio. Default is to fill the '
            'screen.'),
        OptionRecommendation(name='dont_sharpen',
                             recommended_value=False,
                             help='Disable sharpening.'),
        OptionRecommendation(
            name='disable_trim',
            recommended_value=False,
            help='Disable trimming of comic pages. For some comics, trimming '
            'might remove content as well as borders.'),
        OptionRecommendation(
            name='landscape',
            recommended_value=False,
            help="Don't split landscape images into two portrait images"),
        OptionRecommendation(
            name='wide',
            recommended_value=False,
            help="Keep aspect ratio and scale image using screen height as "
            "image width for viewing in landscape mode."),
        OptionRecommendation(
            name='right2left',
            recommended_value=False,
            help='Used for right-to-left publications like manga. '
            'Causes landscape pages to be split into portrait pages '
            'from right to left.'),
        OptionRecommendation(
            name='despeckle',
            recommended_value=False,
            help='Enable Despeckle. Reduces speckle noise. May greatly '
            'increase processing time.'),
        OptionRecommendation(
            name='no_sort',
            recommended_value=False,
            help="Don't sort the files found in the comic "
            "alphabetically by name. Instead use the order they were "
            "added to the comic."),
        OptionRecommendation(
            name='output_format',
            choices=['png', 'jpg'],
            recommended_value='png',
            help='The format that images in the created e-book are '
            'converted to. You can experiment to see which format '
            'gives you optimal size and look on your device.'),
        OptionRecommendation(name='no_process',
                             recommended_value=False,
                             help="Apply no processing to the image"),
        OptionRecommendation(
            name='dont_grayscale',
            recommended_value=False,
            help='Do not convert the image to grayscale (black and white)'),
        OptionRecommendation(
            name='comic_image_size',
            recommended_value=None,
            help='Specify the image size as widthxheight pixels. Normally,'
            ' an image size is automatically calculated from the output '
            'profile, this option overrides it.'),
        OptionRecommendation(
            name='dont_add_comic_pages_to_toc',
            recommended_value=False,
            help='When converting a CBC do not add links to each page to'
            ' the TOC. Note this only applies if the TOC has more than '
            'one section'),
    }

    # (option name, value, level) triples this plugin recommends for the
    # rest of the conversion pipeline; all are given at HIGH level.
    recommendations = {
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('change_justification', 'left', OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        # NOTE(review): 'page_breaks_brefore' looks like a misspelling of
        # 'page_breaks_before', which is also listed below. Presumably this
        # entry matches no real option - confirm how the framework treats
        # unknown names before removing it.
        ('page_breaks_brefore', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
    }

    def get_comics_from_collection(self, stream):
        """Unpack a comic collection and list the comics it declares.

        The archive in *stream* is extracted into a persistent temporary
        directory and its ``comics.txt`` index is read. Each non-blank
        line has the form ``filename:title``; the title falls back to the
        file name without its extension.

        :returns: list of ``[title, absolute_path]`` pairs, limited to
            files that are readable.
        :raises ValueError: if ``comics.txt`` is missing or no listed
            comic is readable.
        """
        from ebook_converter.libunzip import extract as zipextract
        tdir = PersistentTemporaryDirectory('_comic_collection')
        zipextract(stream, tdir)

        # Byte-order marks recognised in comics.txt, checked in this order.
        known_boms = (
            (codecs.BOM_UTF16_BE, 'utf-16-be'),
            (codecs.BOM_UTF16_LE, 'utf-16-le'),
            (codecs.BOM_UTF8, 'utf-8'),
        )
        comics = []
        with CurrentDir(tdir):
            if not os.path.exists('comics.txt'):
                raise ValueError(
                    ('%s is not a valid comic collection'
                     ' no comics.txt was found in the file') % stream.name)
            with open('comics.txt', 'rb') as listing:
                raw = listing.read()
            for bom, encoding in known_boms:
                if raw.startswith(bom):
                    # Drop the decoded BOM character itself.
                    raw = raw.decode(encoding)[1:]
                    break
            else:
                raw = raw.decode('utf-8')
            for entry in raw.splitlines():
                entry = entry.strip()
                if not entry:
                    continue
                fname, _, title = entry.partition(':')
                parts = fname.replace('#', '_').split('/')
                fname = os.path.join(tdir, *parts)
                if not title:
                    title = os.path.basename(fname).rpartition('.')[0]
                if os.access(fname, os.R_OK):
                    comics.append([title, fname])
        if not comics:
            raise ValueError('%s has no comics' % stream.name)
        return comics

    def get_pages(self, comic, tdir2):
        """Extract *comic* and return the page image paths placed in *tdir2*.

        With the ``no_process`` option the extracted pages are copied
        unchanged into *tdir2* under an index-prefixed name; otherwise
        they are run through ``process_pages``.

        :param comic: path of the comic archive to extract.
        :param tdir2: directory that receives the resulting page files.
        :returns: list of page paths.
        :raises ValueError: if the archive contains no pages, or if
            processing leaves no valid page.
        """
        from ebook_converter.ebooks.comic.input import (extract_comic,
                                                        process_pages,
                                                        find_pages)
        tdir = extract_comic(comic)
        new_pages = find_pages(tdir,
                               sort_on_mtime=self.opts.no_sort,
                               verbose=self.opts.verbose)
        if not new_pages:
            raise ValueError('Could not find any pages in the comic: %s' %
                             comic)

        if self.opts.no_process:
            copied = []
            for i, page in enumerate(new_pages):
                target = os.path.join(
                    tdir2, '{} - {}'.format(i, os.path.basename(page)))
                shutil.copyfile(page, target)
                copied.append(target)
            return copied

        new_pages, failures = process_pages(new_pages, self.opts,
                                            self.report_progress, tdir2)
        if failures:
            self.log.warning('Could not process the following pages '
                             '(run with --verbose to see why):')
            for f in failures:
                self.log.warning('\t', f)
        if not new_pages:
            raise ValueError(
                'Could not find any valid pages in comic: %s' % comic)
        # The previous implementation also probed for a thumbnail file
        # here, but the result was never used or returned, so the probe
        # has been dropped.
        return new_pages

    def get_images(self):
        """Return the page image paths collected by the last ``convert`` call.

        ``self._images`` is only assigned inside ``convert``, so calling
        this before a conversion raises ``AttributeError``.
        """
        return self._images

    def convert(self, stream, opts, file_ext, log, accelerators):
        """Convert a comic (or ``cbc`` collection) into an OPF package.

        Pages are extracted into the current working directory (one
        ``comic_N`` sub-directory per comic when there are several),
        wrapped in XHTML, and described by ``metadata.opf`` and
        ``toc.ncx`` written to the current working directory.

        :returns: absolute path of the generated ``metadata.opf``.
        :raises ValueError: if no comic yields any page.
        """
        from ebook_converter.ebooks.metadata import MetaInformation
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.metadata.toc import TOC

        self.opts = opts
        self.log = log
        if file_ext == 'cbc':
            sources = self.get_comics_from_collection(stream)
        else:
            sources = [['Comic', os.path.abspath(stream.name)]]
        stream.close()

        several = len(sources) > 1
        comics = []
        for number, (title, fname) in enumerate(sources, 1):
            cdir = os.path.abspath('comic_%d' % number if several else '.')
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            if self.for_viewer:
                wrappers = [self.create_viewer_wrapper(pages)]
            else:
                wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        book_title = os.path.basename(stream.name).rpartition('.')[0]
        mi = MetaInformation(book_title, ['Unknown'])
        opf = OPFCreator(os.getcwd(), mi)

        def href(x):
            # A single comic lives in the package root; several comics
            # keep their containing directory in the reference.
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        entries = []
        cover_href = None
        for _title, pages, wrappers in comics:
            page_entries = [(href(p), None) for p in pages]
            entries.extend((href(w), None) for w in wrappers)
            entries.extend(page_entries)
            if cover_href is None and page_entries:
                cover_href = page_entries[0][0]
        opf.create_manifest(entries)

        spine = [href(w) for _title, _pages, wrappers in comics
                 for w in wrappers]
        self._images = [p for _title, pages, _wrappers in comics
                        for p in pages]
        opf.create_spine(spine)
        if self.for_viewer and cover_href:
            opf.guide.set_cover(cover_href)

        toc = TOC()
        if len(comics) == 1:
            for page_no, wrapper in enumerate(comics[0][2], 1):
                toc.add_item(href(wrapper), None, 'Page %d' % page_no,
                             play_order=page_no - 1)
        else:
            po = 0
            for title, _pages, wrappers in comics:
                po += 1
                stoc = toc.add_item(href(wrappers[0]), None, title,
                                    play_order=po)
                if opts.dont_add_comic_pages_to_toc:
                    continue
                for page_no, wrapper in enumerate(wrappers, 1):
                    stoc.add_item(href(wrapper), None, 'Page %d' % page_no,
                                  play_order=po)
                    po += 1
        opf.set_toc(toc)

        with open('metadata.opf', 'wb') as m:
            with open('toc.ncx', 'wb') as n:
                opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')

    def create_wrappers(self, pages):
        """Write one XHTML wrapper per page image and return their paths.

        The wrappers are named ``page_<n>.xhtml`` (1-based) and are
        created in the directory of the first page.
        """
        template = textwrap.dedent('''\
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <title>Page #%d</title>
                <style type="text/css">
                    @page { margin:0pt; padding: 0pt}
                    body { margin: 0pt; padding: 0pt}
                    div { text-align: center }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="comic page #%d" />
                </div>
            </body>
        </html>
        ''')
        out_dir = os.path.dirname(pages[0])
        wrappers = []
        for number, image in enumerate(pages, 1):
            markup = template % (const.XHTML_NS, number,
                                 os.path.basename(image), number)
            target = os.path.join(out_dir, 'page_%d.xhtml' % number)
            with open(target, 'wb') as out:
                out.write(markup.encode('utf-8'))
            wrappers.append(target)
        return wrappers

    def create_viewer_wrapper(self, pages):
        """Write a single XHTML file that shows every page image in turn.

        :param pages: non-empty list of page image paths; images are
            referenced by base name, so the wrapper is written into the
            directory of the first page.
        :returns: path of the generated ``wrapper.xhtml``.
        """
        def page(src):
            return '<img src="{}"></img>'.format(os.path.basename(src))

        # Work out the target directory BEFORE building the markup. The
        # earlier code rebound ``pages`` to the joined markup string first,
        # so ``pages[0]`` was the character '<' and the directory was
        # always empty, putting the wrapper in the current working
        # directory instead of next to the images it references.
        base = os.path.dirname(pages[0])
        body = '\n'.join(map(page, pages))
        wrapper = '''
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <style type="text/css">
                html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
                img {
                    width: 100%%; height: 100%%;
                    object-fit: contain;
                    margin-left: auto; margin-right: auto;
                    max-width: 100vw; max-height: 100vh;
                    top: 50vh; transform: translateY(-50%%);
                    position: relative;
                    page-break-after: always;
                }
                </style>
            </head>
            <body>
            %s
            </body>
        </html>
        ''' % (const.XHTML_NS, body)
        path = os.path.join(base, 'wrapper.xhtml')
        with open(path, 'wb') as f:
            f.write(wrapper.encode('utf-8'))
        return path
Ejemplo n.º 6
0
class RTFInput(InputFormatPlugin):
    """Input plugin that converts RTF documents to XHTML.

    ``convert`` runs the rtf2xml parser, extracts embedded pictures,
    applies an XSLT stylesheet and writes ``index.xhtml``, ``styles.css``
    and ``metadata.opf`` into the current working directory.
    """

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = 'Convert RTF files to HTML'
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    # ``ignore_wmf`` is read by ``replace_wmf`` to choose between deleting
    # an unconvertible WMF image and substituting a placeholder.
    options = {
        OptionRecommendation(name='ignore_wmf',
                             recommended_value=False,
                             help='Ignore WMF images instead of '
                             'replacing them with a placeholder '
                             'image.')
    }

    def generate_xml(self, stream):
        """Run the rtf2xml parser on *stream* and return the XML as bytes.

        The parser writes ``dataxml.xml`` into the current working
        directory; that file is then read back. When the
        ``debug_pipeline`` option is set, a ``rtfdebug`` directory is
        created and the parser runs at a higher run level with indented
        output; if that setup fails the normal settings are used.
        """
        from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = u'dataxml.xml'

        # Normal settings; replaced below when debug mode is available.
        run_lev = 1
        debug_dir = None
        indent_out = 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            try:
                os.mkdir(u'rtfdebug')
                debug_dir, run_lev, indent_out = u'rtfdebug', 4, 1
                self.log('Running RTFParser in debug mode')
            except Exception:
                self.log.warn('Impossible to run RTFParser in debug mode')

        # Fall back to cp1252 when no input encoding was supplied.
        encoding = getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252'

        parser_args = {
            'in_file': stream,
            'out_file': ofile,
            # Map symbol, Zapf and Wingding fonts to unicode equivalents
            # (parser default for each is 1).
            'convert_symbol': 1,
            'convert_zapf': 1,
            'convert_wingdings': 1,
            # Turn RTF caps into real capitals (parser default is 1).
            'convert_caps': 1,
            # Indent the resulting XML (parser default is 0, no indent).
            'indent': indent_out,
            # Build lists from RTF (parser default is 1).
            'form_lists': 1,
            # Convert headings to sections (parser default is 0).
            'headings_to_sections': 1,
            # Group paragraphs sharing a style name (parser default is 1).
            'group_styles': 1,
            # Group borders (parser default is 1).
            'group_borders': 1,
            # Write or do not write paragraphs (parser default is 0).
            'empty_paragraphs': 1,
            # Directory for debug output, or None.
            'deb_dir': debug_dir,
            'default_encoding': encoding,
            'run_level': run_lev,
        }
        ParseRtf(**parser_args).parse_rtf()
        with open(ofile, 'rb') as produced:
            return produced.read()

    def extract_images(self, picts):
        """Decode the hex-encoded pictures stored in the file *picts*.

        Every ``{\\pict ...}`` group is stripped of non-hex characters,
        decoded and written to ``NNNN.<fmt>`` in the current working
        directory, where ``<fmt>`` is the detected image type or ``wmf``
        when detection fails.

        :returns: the result of ``convert_images`` applied to the mapping
            of 1-based picture number to file name.
        """
        from ebook_converter.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as src:
            raw = src.read()

        non_hex = re.compile(br'[^a-fA-F0-9]')
        imap = {}
        number = 0
        for group in re.findall(br'\{\\pict([^}]+)\}', raw):
            if not group:
                continue
            enc = non_hex.sub(b'', group)
            if len(enc) % 2 == 1:
                # unhexlify needs an even number of digits.
                enc = enc[:-1]
            data = unhexlify(enc)
            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            number += 1
            name = u'%04d.%s' % (number, fmt)
            with open(name, 'wb') as out:
                out.write(data)
            imap[number] = name
        return self.convert_images(imap)

    def convert_images(self, imap):
        """Run ``convert_image`` over every file name in *imap*, in place.

        Entries whose conversion raises are logged and left unchanged.
        Also resets the cached placeholder image. Returns *imap*.
        """
        self.default_img = None
        for number in list(imap):
            source = imap[number]
            try:
                converted = self.convert_image(source)
            except Exception:
                self.log.exception('Failed to convert', source)
                continue
            imap[number] = converted
        return imap

    def convert_image(self, name):
        """Return a usable file name for the image *name*.

        Non-WMF files are returned unchanged. WMF files are rasterized;
        if that fails the error is logged and ``replace_wmf`` decides
        what to use instead.
        """
        if name.endswith('.wmf'):
            try:
                return self.rasterize_wmf(name)
            except Exception:
                self.log.exception('Failed to convert WMF image %r' % name)
            return self.replace_wmf(name)
        return name

    def replace_wmf(self, name):
        """Substitute an unconvertible WMF image.

        With the ``ignore_wmf`` option the file is deleted and the marker
        ``'__REMOVE_ME__'`` is returned. Otherwise a placeholder image is
        written next to it with a ``.jpg`` name, which is returned. The
        placeholder bytes are created once and cached on the instance.
        """
        if not self.opts.ignore_wmf:
            from ebook_converter.ebooks.covers import message_image
            placeholder = self.default_img
            if placeholder is None:
                placeholder = self.default_img = message_image(
                    'Conversion of WMF images is not '
                    'supported. Use Microsoft Word '
                    'or OpenOffice to save this RTF '
                    'file as HTML and convert that '
                    'in calibre.')
            jpg_name = name.replace('.wmf', '.jpg')
            with open(jpg_name, 'wb') as out:
                out.write(placeholder)
            return jpg_name
        os.remove(name)
        return '__REMOVE_ME__'

    def rasterize_wmf(self, name):
        """Unwrap the WMF file *name* and save the result with a ``.png`` name.

        Returns the new file name; the original file is left in place.
        """
        from ebook_converter.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as src:
            unwrapped = wmf_unwrap(src.read())
        png_name = name.replace('.wmf', '.png')
        with open(png_name, 'wb') as out:
            out.write(unwrapped)
        return png_name

    def write_inline_css(self, ic, border_styles):
        """Append the generated inline and border rules to ``styles.css``.

        :param ic: object exposing ``font_sizes`` and ``colors``
            sequences; one ``span.fs<i>`` / ``span.col<i>`` rule is
            emitted per entry (colors equal to ``'false'`` are skipped).
        :param border_styles: mapping of CSS class name to declaration
            text, as returned by ``convert_borders``.
        """
        fixed_rules = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }

        span.italics { font-style: italic }

        span.bold { font-weight: bold }

        span.small-caps { font-variant: small-caps }

        span.underlined { text-decoration: underline }

        span.strike-through { text-decoration: line-through }

        ''')
        size_rules = '\n'.join(
            'span.fs%d { font-size: %spt }' % (i, x)
            for i, x in enumerate(ic.font_sizes))
        color_rules = '\n'.join(
            'span.col%d { color: %s }' % (i, x)
            for i, x in enumerate(ic.colors) if x != 'false')

        pieces = [fixed_rules, '\n', size_rules, '\n', color_rules]
        pieces.extend('\n\n.%s {\n%s\n}' % (cls, val)
                      for cls, val in border_styles.items())
        css = ''.join(pieces)

        # Opened in append mode so existing stylesheet content is kept.
        with open(u'styles.css', 'ab') as out:
            out.write(css.encode('utf-8'))

    def convert_borders(self, doc):
        """Assign a CSS class to every table cell based on its borders.

        Each ``cell`` element's ``border-cell-<side>-style``,
        ``-line-width`` and ``-color`` attributes are turned into CSS
        declarations. Cells with identical declarations share one
        ``border_style<N>`` class, numbered in order of first appearance,
        and the class is set on the element.

        :param doc: parsed XML tree supporting ``xpath``.
        :returns: mapping of class name to declaration text.
        """
        # Declaration text -> class index. A dict keeps the same
        # first-seen numbering as a list would, without rescanning every
        # earlier style for each cell.
        style_index = {}
        style_map = {}
        for elem in doc.xpath(r'//*[local-name()="cell"]'):
            rules = [
                'border-style: hidden', 'border-width: 1px',
                'border-color: black'
            ]
            for side in ('bottom', 'top', 'left', 'right'):
                bs = elem.get('border-cell-%s-style' % side, None)
                if bs:
                    # Unknown RTF border styles fall back to 'solid'.
                    cbs = border_style_map.get(bs, 'solid')
                    rules.append('border-%s-style: %s' % (side, cbs))
                bw = elem.get('border-cell-%s-line-width' % side, None)
                if bw:
                    rules.append('border-%s-width: %spt' % (side, bw))
                bc = elem.get('border-cell-%s-color' % side, None)
                if bc:
                    rules.append('border-%s-color: %s' % (side, bc))
            style = ';\n'.join(rules)
            idx = style_index.setdefault(style, len(style_index))
            cls = 'border_style%d' % idx
            style_map[cls] = style
            elem.set('class', cls)
        return style_map

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the RTF file behind *stream* into an OPF package.

        Writes ``index.xhtml``, ``styles.css`` and ``metadata.opf`` into
        the current working directory.

        :param stream: open RTF file; ``stream.name`` is passed to the
            parser and the stream itself is re-read for metadata.
        :param options: conversion options, stored on ``self.opts``.
        :param file_ext: unused.
        :param log: logger, stored on ``self.log``.
        :param accelerators: unused.
        :returns: absolute path of the generated ``metadata.opf``.
        :raises ValueError: if the RTF parser reports invalid code.
        """
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.rtf2xml.ParseRtf import \
            RtfInvalidCodeException
        from ebook_converter.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            # Keep the parser error attached as the cause.
            raise ValueError('This RTF file has a feature calibre does not '
                             'support. Convert it to HTML first and then try '
                             'it.\n%s' % e) from e

        # Bind the image map unconditionally: it is consulted below even
        # when the parser produced no picture directory, which used to
        # raise NameError for documents with pict elements but no
        # extracted pictures.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                # Best effort: continue without images.
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = etree.fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Replace picture numbers with the extracted file names.
        for pict in doc.xpath(
                '//rtf:pict[@num]',
                namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        with open(
                pkg_resources.resource_filename('ebook_converter',
                                                'data/rtf.xsl')) as fobj:
            styledoc = etree.fromstring(fobj.read())
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # Collapse runs of newlines into a single newline.
            res = re.sub(b'\n+', b'\n', res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        # Close the OPF file deterministically instead of leaking it.
        with open(u'metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)
        return os.path.abspath(u'metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        """Remove every ``img`` whose ``src`` is ``'__REMOVE_ME__'``.

        Text that followed a removed image is preserved by appending it
        to the previous sibling's tail, or to the parent's text when the
        image was the first child.
        """
        marked = '//*[local-name()="img" and @src="__REMOVE_ME__"]'
        for item in oeb.spine:
            for img in item.data.xpath(marked):
                parent = img.getparent()
                position = parent.index(img)
                parent.remove(img)
                trailing = img.tail
                if not trailing:
                    continue
                if position:
                    previous = parent[position - 1]
                    previous.tail = (previous.tail or '') + trailing
                else:
                    parent.text = (parent.text or '') + trailing
Ejemplo n.º 7
0
class PDFOutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as a PDF file.

    Image collections are handed to ``ebooks.pdf.image_writer``; every other
    book is first serialised with the ``oeb`` output plugin into a temporary
    directory and then rendered by ``ebooks.pdf.html_writer``.
    """

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    ui_data = {
        'paper_sizes': PAPER_SIZES,
        'units': UNITS,
        'font_types': ('serif', 'sans', 'mono')
    }

    options = {
        OptionRecommendation(
            name='use_profile_size',
            recommended_value=False,
            help=
            'Instead of using the paper size specified in the PDF Output options,'
            ' use a paper size corresponding to the current output profile.'
            ' Useful if you want to generate a PDF for viewing on a specific device.'
        ),
        OptionRecommendation(
            name='unit',
            recommended_value='inch',
            level=OptionRecommendation.LOW,
            short_switch='u',
            choices=UNITS,
            help='The unit of measure for page sizes. Default is inch. Choices '
            'are {} '
            'Note: This does not override the unit for margins!'.format(
                ', '.join(UNITS))),
        OptionRecommendation(
            name='paper_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAPER_SIZES,
            help='The size of the paper. This size will be overridden when a '
            'non default output profile is used. Default is letter. Choices '
            'are {}'.format(', '.join(PAPER_SIZES))),
        OptionRecommendation(
            name='custom_size',
            recommended_value=None,
            help='Custom size of the document. Use the form widthxheight '
            'e.g. `123x321` to specify the width and height. '
            'This overrides any specified paper-size.'),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help='Preserve the aspect ratio of the cover, instead'
            ' of stretching it to fill the full first page of the'
            ' generated pdf.'),
        OptionRecommendation(
            name='pdf_serif_family',
            recommended_value='Times',
            help=
            'The font family used to render serif fonts. Will work only if the font is available system-wide.'
        ),
        OptionRecommendation(
            name='pdf_sans_family',
            recommended_value='Helvetica',
            help=
            'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.'
        ),
        OptionRecommendation(
            name='pdf_mono_family',
            recommended_value='Courier',
            help=
            'The font family used to render monospace fonts. Will work only if the font is available system-wide.'
        ),
        # The help text here used to be a copy of the monospace help; this
        # option selects which of the font types is the standard one.
        OptionRecommendation(
            name='pdf_standard_font',
            choices=ui_data['font_types'],
            recommended_value='serif',
            help='The font family used as the standard font'),
        OptionRecommendation(name='pdf_default_font_size',
                             recommended_value=20,
                             help='The default font size'),
        OptionRecommendation(name='pdf_mono_font_size',
                             recommended_value=16,
                             help='The default font size for monospaced text'),
        OptionRecommendation(
            name='pdf_hyphenate',
            recommended_value=False,
            help=
            'Break long words at the end of lines. This can give the text at the right margin a more even appearance.'
        ),
        OptionRecommendation(
            name='pdf_mark_links',
            recommended_value=False,
            help='Surround all links with a red box, useful for debugging.'),
        OptionRecommendation(
            name='pdf_page_numbers',
            recommended_value=False,
            help=
            'Add page numbers to the bottom of every page in the generated PDF file. If you '
            'specify a footer template, it will take precedence '
            'over this option.'),
        OptionRecommendation(
            name='pdf_footer_template',
            recommended_value=None,
            help='An HTML template used to generate %s on every page.'
            ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
            % 'footers'),
        OptionRecommendation(
            name='pdf_header_template',
            recommended_value=None,
            help='An HTML template used to generate %s on every page.'
            ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
            % 'headers'),
        OptionRecommendation(
            name='pdf_add_toc',
            recommended_value=False,
            help=
            'Add a Table of Contents at the end of the PDF that lists page numbers. '
            'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.'
        ),
        OptionRecommendation(name='toc_title',
                             recommended_value=None,
                             help='Title for generated table of contents.'),
        OptionRecommendation(
            name='pdf_page_margin_left',
            recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the left page margin, in pts. Default is 72pt.'
            ' Overrides the common left page margin setting.'),
        OptionRecommendation(
            name='pdf_page_margin_top',
            recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the top page margin, in pts. Default is 72pt.'
            ' Overrides the common top page margin setting, unless set to zero.'
        ),
        OptionRecommendation(
            name='pdf_page_margin_right',
            recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the right page margin, in pts. Default is 72pt.'
            ' Overrides the common right page margin setting, unless set to zero.'
        ),
        OptionRecommendation(
            name='pdf_page_margin_bottom',
            recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the bottom page margin, in pts. Default is 72pt.'
            ' Overrides the common bottom page margin setting, unless set to zero.'
        ),
        OptionRecommendation(
            name='pdf_use_document_margins',
            recommended_value=False,
            help=
            'Use the page margins specified in the input document via @page CSS rules.'
            ' This will cause the margins specified in the conversion settings to be ignored.'
            ' If the document does not specify page margins, the conversion settings will be used as a fallback.'
        ),
        OptionRecommendation(
            name='pdf_page_number_map',
            recommended_value=None,
            help=
            'Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
            ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.'
        ),
        OptionRecommendation(
            name='uncompressed_pdf',
            recommended_value=False,
            help='Generate an uncompressed PDF, useful for debugging.'),
        OptionRecommendation(
            name='pdf_odd_even_offset',
            recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help='Shift the text horizontally by the specified offset (in pts).'
            ' On odd numbered pages, it is shifted to the right and on even'
            ' numbered pages to the left. Use negative numbers for the opposite'
            ' effect. Note that this setting is ignored on pages where the margins'
            ' are smaller than the specified offset. Shifting is done by setting'
            ' the PDF CropBox, not all software respects the CropBox.')
    }

    def specialize_options(self, log, opts, input_fmt):
        """Set up Qt WebEngine and adjust margin options before conversion.

        Registers the ``FAKE_PROTOCOL`` URL scheme with Qt WebEngine, calls
        ``must_use_qt()`` and remembers ``input_fmt`` on the plugin. When
        ``opts.pdf_use_document_margins`` is set, the four common margin
        options are set to ``-1``.
        """
        # Ensure Qt is setup to be used with WebEngine
        # specialize_options is called early enough in the pipeline
        # that hopefully no Qt application has been constructed as yet
        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
        from ebook_converter.gui2 import must_use_qt
        from ebook_converter.constants_old import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = -1
            opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Convert ``oeb_book`` to a PDF written at ``output_path``.

        Book metadata is obtained by serialising the OEB metadata into an
        OPF 2.0 ``package`` element and parsing that back into a book
        metadata object. The input plugin's ``is_image_collection`` flag
        selects between the image writer and the text (HTML) writer.
        """
        from io import BytesIO
        from lxml import etree
        # Both modules export a name ``OPF``: one builds namespaced tag
        # names, the other parses OPF documents. Alias them so neither
        # shadows the other.
        from ebook_converter.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from ebook_converter.ebooks.metadata.opf2 import OPF as OPFReader

        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path

        package = etree.Element(opf_tag('package'),
                                attrib={
                                    'version': '2.0',
                                    'unique-identifier': 'dummy'
                                },
                                nsmap={None: OPF2_NS})
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPFReader(BytesIO(
            etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        """Write ``images`` to ``self.output_path`` via the PDF image writer."""
        from ebook_converter.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata,
                self.report_progress)

    def get_cover_data(self):
        """Store the cover item's data in ``self.cover_data`` when present.

        Nothing is changed when the book declares no cover or the declared
        cover id is not in the manifest.
        """
        oeb = self.oeb
        covers = oeb.metadata.cover
        if not covers:
            return
        cover_id = str(covers[0])
        if cover_id in oeb.manifest.ids:
            self.cover_data = oeb.manifest.ids[cover_id].data

    def process_fonts(self):
        """Make sure all fonts are embeddable.

        For each ``@font-face`` rule found in a manifest item exposing
        ``cssRules``, the referenced font is passed through
        ``remove_embed_restriction``. When that changes the font data, the
        manifest item is updated and the new data is written through the
        OEB container. Fonts that cannot be resolved or processed are
        skipped (best effort).
        """
        from ebook_converter.ebooks.oeb.base import urlnormalize
        from ebook_converter.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                try:
                    src = rule.style.getProperty('src').propertyValue[0].uri
                except Exception:
                    # No usable src in this rule; deliberately skipped, but
                    # without swallowing KeyboardInterrupt/SystemExit.
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path))
                if ff is None or path in processed:
                    continue

                # Mark the path before trying, so that a font which fails
                # is not retried for every rule that references it.
                processed.add(path)
                raw = ff.data
                try:
                    nraw = remove_embed_restriction(raw)
                except Exception:
                    continue
                if nraw != raw:
                    ff.data = nraw
                    self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        """Render a text based book to PDF through the HTML writer.

        Collects the cover and fixes font embedding flags first. When
        document margins are requested and were stored during conversion,
        they are attached to each item's root element as a JSON encoded
        ``data-calibre-pdf-output-page-margins`` attribute. The book is then
        written with the ``oeb`` output plugin into a temporary directory
        and the first ``*.opf`` file found there is handed to the writer.
        """
        import json
        from ebook_converter.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            for href, margins in self.stored_page_margins.items():
                item = oeb_book.manifest.hrefs.get(href)
                if item is None:
                    continue
                root = item.data
                if hasattr(root, 'xpath') and margins:
                    root.set('data-calibre-pdf-output-page-margins',
                             json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from ebook_converter.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts,
                               self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(opfpath,
                    self.opts,
                    metadata=self.metadata,
                    output_path=self.output_path,
                    log=self.log,
                    cover_data=self.cover_data,
                    report_progress=self.report_progress)
Ejemplo n.º 8
0
class TXTInput(InputFormatPlugin):
    """Input plugin that turns plain text, Markdown and Textile into OEB.

    The text is decoded, normalised and reflowed according to the selected
    paragraph type, converted to HTML by the selected formatting processor
    and finally passed through the ``html`` input plugin.
    """

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': 'Try to auto detect paragraph type',
            'block': 'Treat a blank line as a paragraph break',
            'single': 'Assume every line is a paragraph',
            'print': 'Assume every line starting with 2+ spaces or a tab '
            'starts a paragraph',
            'unformatted': 'Most lines have hard line breaks, few/no blank '
            'lines or indents',
            'off': 'Don\'t modify the paragraph structure',
        },
        'formatting_types': {
            'auto': 'Automatically decide which formatting processor to use',
            'plain': 'No formatting',
            'heuristic': 'Use heuristics to determine chapter headings, '
            'italics, etc.',
            'textile': 'Use the TexTile markup language',
            'markdown': 'Use the Markdown markup language'
        },
    }

    options = {
        OptionRecommendation(name='formatting_type',
                             recommended_value='auto',
                             choices=list(ui_data['formatting_types']),
                             help='Formatting used within the document.\n'
                             '* auto: {auto}\n'
                             '* plain: {plain}\n'
                             '* heuristic: {heuristic}\n'
                             '* textile: {textile}\n'
                             '* markdown: {markdown}\n'
                             'To learn more about markdown see '
                             '{url}'.format(
                                 url='https://daringfireball.net/projects/'
                                 'markdown/',
                                 **ui_data['formatting_types'])),
        OptionRecommendation(
            name='paragraph_type',
            recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help='Paragraph structure to assume. The value of "off" is useful '
            'for formatted documents such as Markdown or Textile. '
            'Choices are:\n'
            '* auto: {auto}\n'
            '* block: {block}\n'
            '* single: {single}\n'
            '* print:  {print}\n'
            '* unformatted: {unformatted}\n'
            '* off: {off}'.format(**ui_data['paragraph_types'])),
        OptionRecommendation(
            name='preserve_spaces',
            recommended_value=False,
            help='Normally extra spaces are condensed into a single space. '
            'With this option all spaces will be displayed.'),
        OptionRecommendation(
            name='txt_in_remove_indents',
            recommended_value=False,
            help='Normally extra space at the beginning of lines is retained. '
            'With this option they will be removed.'),
        OptionRecommendation(
            name="markdown_extensions",
            recommended_value='footnotes, tables, toc',
            help='Enable extensions to markdown syntax. Extensions are '
            'formatting that is not part of the standard markdown '
            'format. The extensions enabled by default: %default.\nTo '
            'learn more about markdown extensions, see {}\nThis should '
            'be a comma separated list of extensions to enable:'
            '\n'.format('https://python-markdown.github.io/extensions/') +
            '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k])
                      for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, fname, data):
        """Write ``data`` into ``self.output_dir`` without overwriting.

        If ``fname`` already exists there, ``-1``, ``-2``, ... is inserted
        before the extension until an unused name is found. Returns the
        path of the file that was written.
        """
        name, ext = os.path.splitext(fname)
        candidate = os.path.join(self.output_dir, fname)
        counter = 0
        while os.path.exists(candidate):
            counter += 1
            candidate = os.path.join(self.output_dir,
                                     '{}-{}{}'.format(name, counter, ext))
        with open(candidate, 'wb') as f:
            f.write(data)
        return candidate

    def fix_resources(self, html, base_dir):
        """Copy locally referenced images next to the output and relink them.

        For every ``img`` whose ``src`` is neither absolute nor prefixed
        with ``file``, ``http``, ``https`` or ``ftp``, the file is looked up
        relative to ``base_dir``. Readable files are copied with
        ``shift_file`` and ``src`` is rewritten to the copy's base name.
        Returns the re-serialised HTML when something changed, otherwise
        ``html`` unchanged.
        """
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            prefix = src.split(':', 1)[0].lower()
            if prefix in ('file', 'http', 'https', 'ftp') or os.path.isabs(src):
                continue
            src = os.path.join(base_dir, src)
            if not os.access(src, os.R_OK):
                continue
            with open(src, 'rb') as f:
                data = f.read()
            copied = self.shift_file(os.path.basename(src), data)
            changed = True
            img.set('src', os.path.basename(copied))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    @staticmethod
    def _strip_bom(raw):
        """Return ``raw`` (bytes) without a leading Unicode byte order mark."""
        import codecs
        # The UTF-32 marks must be tried before the UTF-16 ones: the UTF-32
        # LE mark (FF FE 00 00) starts with the UTF-16 LE mark (FF FE), so
        # the other order would strip only half of it.
        for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
            if raw.startswith(bom):
                return raw[len(bom):]
        return raw

    def _input_encoding(self, txt, options, log):
        """Return the encoding used to decode ``txt`` (bytes).

        A user supplied ``options.input_encoding`` wins; otherwise the first
        4096 bytes are run through the detector and ``utf-8`` is used when
        that yields nothing.
        """
        from ebook_converter.ebooks.chardet import detect
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s', ienc)
        else:
            detected = detect(txt[:4096])
            det_encoding = detected['encoding']
            confidence = detected['confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of '
                '%s%%', ienc, confidence * 100)
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect '
                'using %s', ienc)
        return ienc

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text in ``stream`` to an OEB book and return it.

        ``options`` is modified in place: detected paragraph and formatting
        types are stored on it, the ``html`` input plugin's recommended
        values are copied onto it and ``input_encoding`` is set to
        ``utf-8``. Raises ``ValueError`` when the Markdown processor fails
        with a ``RuntimeError``.
        """
        from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = os.getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            chunks = []
            for root, _, fnames in os.walk('.'):
                for x in fnames:
                    x = os.path.join(root, x)
                    if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                        with open(x, 'rb') as tf:
                            chunks.append(tf.read() + b'\n\n')
            txt = b''.join(chunks)
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info(
                    'File extension indicates particular formatting. '
                    'Forcing formatting type to: %s', options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document (detected on the raw bytes,
        # before any byte order mark is removed).
        ienc = self._input_encoding(txt, options, log)
        # Remove BOM from start of txt as its presence can confuse markdown
        txt = self._strip_bom(txt)
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using '
                          'block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s',
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s',
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            options.enable_heuristics = True
            options.unwrap_lines = False
            options.smarten_punctuation = True

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=self.log)
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            # Reuse the line length computed for 'unformatted' input.
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        # NOTE: nothing in this class appends to shifted_files, so the
        # cleanup loop in the finally clause below currently removes nothing.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(
                        txt,
                        extensions=[
                            x.strip()
                            for x in options.markdown_extensions.split(',')
                            if x.strip()
                        ])
                except RuntimeError as err:
                    # Chain the original error so the cause stays visible.
                    raise ValueError(
                        'This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                    ) from err
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from ebook_converter.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            # debug_pipeline is switched off for the nested HTML conversion
            # and restored afterwards, also when that conversion fails.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # Generate oeb from html conversion.
                with open(htmlfile, 'rb') as html_stream:
                    oeb = html_input.convert(html_stream, options, 'html',
                                             log, {})
            finally:
                options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from ebook_converter.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        # Remembered for postprocess_book().
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Replace ``Unknown`` titles with the title recorded by convert().

        Every ``title`` element in a spine item whose text is exactly
        ``Unknown`` gets ``self.html_postprocess_title`` as its text.
        """
        for item in oeb.spine:
            if not hasattr(item.data, 'xpath'):
                continue
            for title in item.data.xpath('//*[local-name()="title"]'):
                if title.text == 'Unknown':
                    title.text = self.html_postprocess_title
Ejemplo n.º 9
0
class SNBOutput(OutputFormatPlugin):
    """Output format plugin registered for the ``snb`` file type."""

    name = 'SNB Output'
    author = 'Li Fanxi'
    file_type = 'snb'
    commit_name = 'snb_output'

    # Plugin-specific conversion options; every one is recommended at the
    # LOW level.
    options = {
        OptionRecommendation(
            name='snb_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
            'The default is utf-8.'),
        OptionRecommendation(
            name='snb_max_line_length',
            recommended_value=0,
            level=OptionRecommendation.LOW,
            help='The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is '
            'found the line will be broken at the space after and will '
            'exceed the specified value. Also, there is a minimum of 25 '
            'characters. Use 0 to disable line splitting.'),
        OptionRecommendation(
            name='snb_insert_empty_line',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Specify whether or not to insert an empty line between two '
            'paragraphs.'),
        OptionRecommendation(
            name='snb_dont_indent_first_line',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Specify whether or not to insert two space characters to '
            'indent the first line of each paragraph.'),
        OptionRecommendation(
            name='snb_hide_chapter_name',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Specify whether or not to hide the chapter title for each '
            'chapter. Useful for image-only output (eg. comics).'),
        # Read by HandleImage to choose which output-profile screen size
        # images are fitted to.
        OptionRecommendation(
            name='snb_full_screen',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Resize all the images for full screen view. '),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` as an SNB file.

        The book is laid out in a temporary directory: ``snbf/book.snbf``
        (book info), ``snbf/toc.snbf`` (table of contents), the chapter
        ``.snbc`` files under ``snbc/`` and the images under
        ``snbc/images/``. The directory is then packaged with ``SNBFile``.

        :param oeb_book: the OEB book to convert. The SVG rasterizer is run
            on it first when one is available.
        :param output_path: destination handed to ``SNBFile.Output``.
        :param input_plugin: unused here.
        :param opts: conversion options; also stored on ``self.opts`` so
            that ``HandleImage`` can read them.
        :param log: logger used for progress messages and warnings.
        """
        from lxml import etree
        # Imported once up front instead of on every spine iteration.
        from ebook_converter.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
        from ebook_converter.ebooks.snb.snbfile import SNBFile
        from ebook_converter.ebooks.snb.snbml import SNBMLizer, ProcessFileName

        def write_tree(path, tree):
            # Serialize an lxml tree to ``path`` as pretty-printed UTF-8.
            with open(path, 'wb') as f:
                f.write(
                    etree.tostring(tree,
                                   pretty_print=True,
                                   encoding='utf-8'))

        self.opts = opts
        from ebook_converter.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process Meta data: only the first title/publisher/language/
            # description entry is used; missing entries become ''.
            meta = oeb_book.metadata
            title = str(meta.title[0]) if meta.title else ''
            authors = [str(x) for x in meta.creator if x.role == 'aut']
            publishers = str(meta.publisher[0]) if meta.publisher else ''
            lang = str(meta.language[0]).upper() if meta.language else ''
            abstract = str(meta.description[0]) if meta.description else ''

            # Process Cover: the guide's cover reference is used only when
            # the guide has no 'titlepage' entry.
            guide = oeb_book.guide
            manifest = oeb_book.manifest
            spine = oeb_book.spine
            href = None
            if 'titlepage' not in guide and 'cover' in guide:
                href = guide['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(
                headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            if href is not None:
                etree.SubElement(headTree,
                                 "cover").text = ProcessFileName(href)
            else:
                etree.SubElement(headTree, "cover")
            write_tree(os.path.join(snbfDir, 'book.snbf'), bookInfoTree)

            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            # Maps a spine file href to the list of (fragment, title) pairs
            # that the TOC points at inside that file.
            outputFiles = {}
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                         'Creating a default TOC')
                first = next(iter(oeb_book.spine))
                oeb_book.toc.add('Start page', first.href)
            else:
                first = next(iter(oeb_book.spine))
                if oeb_book.toc[0].href != first.href:
                    # The pages before the first item in toc will be stored
                    # as "Cover Pages".
                    # oeb_book.toc does not support "insert", so we generate
                    # the tocInfoTree directly instead of modifying the toc
                    ch = etree.SubElement(tocBody, "chapter")
                    ch.set("src", ProcessFileName(first.href) + ".snbc")
                    ch.text = 'Cover pages'
                    outputFiles[first.href] = [("", "Cover pages")]

            for tocitem in oeb_book.toc:
                if '#' in tocitem.href:
                    parts = tocitem.href.split('#')
                    if len(parts) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        file_href, fragment = parts
                        if file_href not in outputFiles:
                            # First reference into this file is a fragment:
                            # add a "(Preface)" chapter for the content that
                            # precedes it.
                            preface = tocitem.title + " (Preface)"
                            outputFiles[file_href] = [("", preface)]
                            ch = etree.SubElement(tocBody, "chapter")
                            ch.set("src",
                                   ProcessFileName(file_href) + ".snbc")
                            ch.text = preface
                        outputFiles[file_href].append(
                            (fragment, tocitem.title))
                else:
                    outputFiles.setdefault(tocitem.href, []).append(
                        ("", tocitem.title))
                ch = etree.SubElement(tocBody, "chapter")
                ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                ch.text = tocitem.title

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)

            write_tree(os.path.join(snbfDir, 'toc.snbf'), tocInfoTree)

            # Output Files
            oldTree = None      # tree of the most recently written chapter
            mergeLast = False   # True while files are merged into oldTree
            lastName = None     # file name oldTree was written under
            for item in spine:
                if manifest.hrefs[item.href].media_type not in OEB_DOCS:
                    continue
                if item.href not in outputFiles:
                    log.debug(
                        'File %s is unused in TOC. Continue in last chapter'
                        % item.href)
                    mergeLast = True
                elif oldTree is not None and mergeLast:
                    # A TOC-referenced file follows merged content: flush the
                    # chapter that received the merges before starting anew.
                    log.debug('Output the modified chapter again: %s' %
                              lastName)
                    write_tree(os.path.join(snbcDir, lastName), oldTree)
                    mergeLast = False

                log.debug('Converting %s to snbc...' % item.href)
                snbwriter = SNBMLizer(log)
                if not mergeLast:
                    snbcTrees = snbwriter.extract_content(
                        oeb_book, item, outputFiles[item.href], opts)
                    for subName in snbcTrees:
                        postfix = '_' + subName if subName != '' else ''
                        lastName = ProcessFileName(item.href + postfix +
                                                   ".snbc")
                        oldTree = snbcTrees[subName]
                        write_tree(os.path.join(snbcDir, lastName), oldTree)
                else:
                    log.debug('Merge %s with last TOC item...' % item.href)
                    snbwriter.merge_content(oldTree, oeb_book, item,
                                            [('', "Start")], opts)

            # Output the last one if needed. The message is only logged when
            # a chapter is really rewritten.
            if oldTree is not None and mergeLast:
                log.debug('Output the last modified chapter again: %s' %
                          lastName)
                write_tree(os.path.join(snbcDir, lastName), oldTree)

            for item in manifest:
                if manifest.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = manifest.hrefs[item.href].data
                    # Convert & Resize image
                    self.HandleImage(
                        content,
                        os.path.join(snbiDir, ProcessFileName(item.href)))

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

    def HandleImage(self, imageData, imagePath):
        """Write ``imageData`` to ``imagePath``, shrinking it to fit the screen.

        The target size comes from ``self.opts.output_profile``:
        ``screen_size`` when ``snb_full_screen`` is set, otherwise
        ``comic_screen_size``. When no options are available a 540x700
        screen is assumed. Images that exceed the target in either dimension
        are scaled down by the larger of the two ratios, so the aspect ratio
        is kept. Smaller images are written at their original size.

        :param imageData: raw image data, decoded with ``image_from_data``.
        :param imagePath: output file path; the text after its last ``.`` is
            used as the output image format.
        """
        from ebook_converter.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        # getattr: ``opts`` is only assigned by convert(), so fall back to the
        # default screen size instead of raising when it was never set.
        opts = getattr(self, 'opts', None)
        if opts:
            if opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = opts.output_profile.comic_screen_size
        else:
            SCREEN_X, SCREEN_Y = 540, 700
        # Handle big image only
        if x > SCREEN_X or y > SCREEN_Y:
            scale = max(float(x) / SCREEN_X, float(y) / SCREEN_Y)
            # TODO : intelligent image rotation
            # ``//`` with a float operand yields a float; pass whole pixel
            # counts and never let a dimension collapse to zero.
            new_x = max(1, int(x // scale))
            new_y = max(1, int(y // scale))
            img = resize_image(img, new_x, new_y)
        with open(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
Ejemplo n.º 10
0
class EPUBOutput(OutputFormatPlugin):
    """Output format plugin registered for the ``epub`` file type."""

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'
    commit_name = 'epub_output'
    # The 'versions' tuple is reused below as the allowed choices of the
    # epub_version option.
    ui_data = {'versions': ('2', '3')}

    # Plugin-specific conversion options.
    options = {
        OptionRecommendation(name='extract_to',
            help='Extract the contents of the generated %s file to the '
                 'specified directory. The contents of the directory are '
                 'first deleted, so be careful.' % 'EPUB'),

        OptionRecommendation(name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Turn off splitting at page breaks. Normally, input '
                 'files are automatically split at every page break into '
                 'two files. This gives an output e-book that can be '
                 'parsed faster and with less resources. However, '
                 'splitting is slow and if your source file contains a '
                 'very large number of page breaks, you should turn off '
                 'splitting on page breaks.'
        ),

        # The value is in KB; convert() multiplies it by 1024 before handing
        # it to the Split transform.
        OptionRecommendation(name='flow_size', recommended_value=260,
            help='Split all HTML files larger than this size (in KB). '
                 'This is necessary as most EPUB readers cannot handle large '
                 'file sizes. The default of %defaultKB is the size required '
                 'for Adobe Digital Editions. Set to 0 to disable size based '
                 'splitting.'
        ),

        OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
            help='Normally, if the input file has no cover and you don\'t'
            ' specify one, a default cover is generated with the title, '
            'authors, etc. This option disables the generation of this cover.'
        ),

        OptionRecommendation(name='no_svg_cover', recommended_value=False,
            help='Do not use SVG for the book cover. Use this option if '
                'your EPUB is going to be used on a device that does not '
                'support SVG, like the iPhone or the JetBook Lite. '
                'Without this option, such devices will display the cover '
                'as a blank page.'
        ),

        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help='When using an SVG cover, this option will cause the cover '
                 'to scale to cover the available screen area, but still '
                 'preserve its aspect ratio (ratio of width to height). That '
                 'means there may be white borders at the sides or top and '
                 'bottom of the image, but the image will never be distorted. '
                 'Without this option the image may be slightly distorted, '
                 'but there will be no borders.'
        ),

        OptionRecommendation(name='epub_flatten', recommended_value=False,
            help='This option is needed only if you intend to use the EPUB'
                 ' with FBReaderJ. It will flatten the file system inside the'
                 ' EPUB, putting all files into the top level.'
        ),

        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
            help='Insert an inline Table of Contents that will appear as part '
                 'of the main book content.'
        ),

        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
            help='Put the inserted inline Table of Contents at the end of '
                 'the book instead of the start.'
        ),

        OptionRecommendation(name='toc_title', recommended_value=None,
            help='Title for any generated in-line table of contents.'
        ),

        OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
            help='The version of the EPUB file to generate. EPUB 2 is the '
                 'most widely compatible, only use EPUB 3 if you know you '
                 'actually need it.'
        )
        }

    # pretty_print is recommended at HIGH level; condense_ncx() only strips
    # NCX whitespace when that option ends up disabled.
    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def workaround_webkit_quirks(self):  # {{{
        """Retag empty ``<pre>`` elements as ``div`` in each spine document.

        A ``<pre>`` counts as empty when it has no text and no children.
        Spine items without a usable ``<body>`` element are skipped.
        """
        find_body = base.XPath('//h:body')
        find_pre = base.XPath('//h:pre')
        for spine_item in self.oeb.spine:
            bodies = find_body(spine_item.data)
            body = bodies[0] if bodies else bodies
            if not hasattr(body, 'xpath'):
                continue

            empty_pres = [pre for pre in find_pre(body)
                          if not pre.text and len(pre) == 0]
            for pre in empty_pres:
                pre.tag = 'div'
    # }}}

    def upshift_markup(self):  # {{{
        """Upgrade markup to comply with XHTML 1.1 where possible.

        For each spine document: copy ``lang`` to ``xml:lang`` when the
        latter is missing, retag ``<u>`` elements as ``span`` and drop
        repeated ``id`` / ``name`` attribute values, keeping the first
        occurrence of each. Apart from the ``lang`` copy, documents without
        a usable ``<body>`` element are left alone.
        """
        for spine_item in self.oeb.spine:
            root = spine_item.data
            xml_lang = base.tag('xml', 'lang')
            if not root.get(xml_lang) and root.get('lang'):
                root.set(xml_lang, root.get('lang'))

            bodies = base.XPath('//h:body')(root)
            body = bodies[0] if bodies else bodies
            if not hasattr(body, 'xpath'):
                continue

            for underline in base.XPath('//h:u')(root):
                underline.tag = 'span'

            # Values already seen in this document, tracked per attribute.
            seen = {'id': set(), 'name': set()}
            for elem in base.XPath('//*[@id or @name]')(root):
                for attr in ('id', 'name'):
                    value = elem.get(attr, None)
                    if not value:
                        continue
                    if value in seen[attr]:
                        del elem.attrib[attr]
                    else:
                        seen[attr].add(value)

    # }}}

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` as an EPUB file.

        Runs the EPUB-specific transforms (optional inline TOC, filename
        normalisation, reader workarounds, image rescaling, splitting,
        cover handling), makes sure the book has a UUID identifier and a
        TOC, lets the ``oeb`` output plugin write the book into a temporary
        directory and finally zips that directory into the EPUB container.

        :param oeb: the OEB book to convert (modified by the transforms).
        :param output_path: path of the EPUB file to create.
        :param input_plugin: input plugin; its optional ``encrypted_fonts``
            attribute lists the font URIs to obfuscate.
        :param opts: conversion options.
        :param log: logger.
        """
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
            # TOCAdder reads the mobi_* option names, so map ours onto them.
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from ebook_converter.ebooks.oeb.transforms.split import Split
        # flow_size is given in KB, Split takes the limit multiplied by 1024.
        split = Split(not self.opts.dont_split_on_page_breaks,
                max_flow_size=self.opts.flow_size*1024
                )
        split(self.oeb, self.opts)

        from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
                no_default_cover=self.opts.no_default_epub_cover,
                no_svg_cover=self.opts.no_svg_cover,
                preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                    'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add('Start', first.href)

        identifiers = oeb.metadata['identifier']
        _uuid = None
        for x in identifiers:
            # An identifier may lack the opf:scheme attribute; treat that as
            # an empty scheme instead of calling .lower() on None.
            scheme = x.get(base.tag('opf', 'scheme'), None) or ''
            if scheme.lower() == 'uuid' or str(x).startswith('urn:uuid:'):
                _uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if _uuid is None:
            self.log.warn('No UUID identifier found')
            _uuid = str(uuid.uuid4())
            oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)

        if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if str(x) == _uuid:
                    x.content = 'urn:uuid:' + _uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from ebook_converter.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from ebook_converter.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            # The first .opf / .ncx file found in the output directory is
            # used.
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                    if x.endswith('.ncx')][0])
            if self.opts.epub_version == '3':
                self.upgrade_to_epub3(tdir, opf)
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)

            from ebook_converter.ebooks.epub import initialize_container
            with initialize_container(output_path, os.path.basename(opf),
                    extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                            metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from ebook_converter.utils.zipfile import ZipFile
                # Whatever already exists at extract_to is deleted first.
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)

    def upgrade_to_epub3(self, tdir, opf):
        """Upgrade the EPUB 2 book stored in ``tdir`` to EPUB 3 in place.

        A temporary ``META-INF/container.xml`` pointing at ``opf`` is written
        so the directory can be opened as an ``EpubContainer``; it is removed
        again once the upgraded container has been committed.

        :param tdir: directory holding the book files.
        :param opf: name of the OPF file; only its basename is used.
        """
        self.log.info('Upgrading to EPUB 3...')
        from ebook_converter.ebooks.epub import simple_container_xml
        from ebook_converter.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
        from ebook_converter.ebooks.oeb.polish.container import EpubContainer
        from ebook_converter.ebooks.oeb.polish.upgrade import epub_2_to_3

        meta_inf = os.path.join(tdir, 'META-INF')
        container_xml = os.path.join(tdir, 'META-INF', 'container.xml')

        try:
            os.mkdir(meta_inf)
        except OSError:
            # Best effort, e.g. the directory is already there.
            pass
        with open(container_xml, 'wb') as stub:
            stub.write(
                simple_container_xml(os.path.basename(opf)).encode('utf-8'))

        container = EpubContainer(tdir, self.log)

        # Hand over a nav document remembered on the options, if any.
        existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
        nav_href = getattr(self.opts, 'epub3_nav_href', None)
        previous_nav = None
        if existing_nav and nav_href:
            previous_nav = (nav_href, existing_nav)

        epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
        fix_conversion_titlepage_links_in_nav(container)
        container.commit()

        os.remove(container_xml)
        try:
            os.rmdir(meta_inf)
        except OSError:
            # Leave the directory behind if it cannot be removed.
            pass

    def encrypt_fonts(self, uris, tdir, _uuid):  # {{{
        """Obfuscate font files in place and build the ``encryption.xml`` text.

        The first 1024 bytes of each font are XOR-ed with a 16-byte key made
        from the hexadecimal digits of ``_uuid``. Fonts that do not exist
        under ``tdir`` or are shorter than 1024 bytes are skipped and not
        listed.

        :param uris: ``/``-separated font paths relative to ``tdir``;
            duplicates are processed only once.
        :param tdir: directory containing the book files.
        :param _uuid: book identifier the key is derived from.
        :returns: the ``encryption.xml`` document as a string, or ``None``
            when no font was obfuscated.
        :raises ValueError: if ``_uuid`` has fewer than 16 hex digits.
        """
        from xml.sax.saxutils import escape
        from ebook_converter.polyglot.binary import from_hex_bytes

        key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid'% _uuid)
        # Doubling guarantees at least 32 hex digits, i.e. a 16-byte key.
        key = bytearray(from_hex_bytes((key + key)[:32]))
        with CurrentDir(tdir):
            fonts = []
            # dict.fromkeys() drops duplicate URIs: obfuscating the same file
            # twice would XOR it back to the original bytes.
            for uri in dict.fromkeys(uris):
                path = os.path.join(*uri.split('/'))
                if not os.path.exists(path):
                    continue
                self.log.debug('Encrypting font:', uri)
                with open(path, 'r+b') as f:
                    data = f.read(1024)
                    if len(data) < 1024:
                        self.log.warn('Font', path, 'is invalid, ignoring')
                        # The file was not obfuscated, so it must not be
                        # declared as encrypted either.
                        continue
                    f.seek(0)
                    f.write(bytes(byte ^ key[i % 16]
                                  for i, byte in enumerate(data)))
                if not isinstance(uri, str):
                    uri = uri.decode('utf-8')
                # XML attribute values are escaped with entities, not with
                # backslashes.
                fonts.append('''
                <enc:EncryptedData>
                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                    <enc:CipherData>
                    <enc:CipherReference URI="%s"/>
                    </enc:CipherData>
                </enc:EncryptedData>
                '''%(escape(uri, {'"': '&quot;'})))
            if fonts:
                ans = '''<encryption
                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                    '''
                ans += '\n'.join(fonts)
                ans += '\n</encryption>'
                return ans
        return None
    # }}}

    def condense_ncx(self, ncx_path):  # {{{
        """Strip surrounding whitespace from the NCX file at ``ncx_path``.

        Does nothing when the ``pretty_print`` option is enabled. Otherwise
        the text and tail of every element are stripped and the file is
        rewritten in place as UTF-8.
        """
        from lxml import etree
        if self.opts.pretty_print:
            return

        root = etree.parse(ncx_path).getroot()
        # tag=etree.Element limits the walk to element nodes.
        for node in root.iter(tag=etree.Element):
            for attr in ('text', 'tail'):
                value = getattr(node, attr)
                if value:
                    setattr(node, attr, value.strip())

        condensed = etree.tostring(root, encoding='utf-8')
        with open(ncx_path, 'wb') as out:
            out.write(condensed)
    # }}}

    def workaround_ade_quirks(self):  # {{{
        """
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        TOC entries lose fragment identifiers that contain characters other
        than ``-A-Za-z0-9_:.``. Each spine document and the main stylesheet
        are then rewritten in place by the steps commented below.
        """

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                _base, _, frag = href.partition('#')
                frag = urllib.parse.unquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                    node.href = _base

        # ADE fails to render non breaking hyphens/soft hyphens/zero width
        # spaces. The pattern is the same for every document, so it is
        # compiled once outside the loop.
        special_chars = re.compile('[\u200b\u00ad]')

        for item in self.oeb.spine:
            root = item.data
            body = base.XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                # NOTE(review): removing an lxml element also discards its
                # tail text -- confirm that is acceptable for the removals
                # in this method.
                bad = []
                for img in base.XPath('//h:img')(body):
                    src = img.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for anchor in base.XPath('//h:a[@name]')(body):
                    if not anchor.get('id', False):
                        anchor.set('id', anchor.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in base.XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = parse_utils.barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No preceding sibling (StopIteration) or a sibling
                        # whose tag cannot be reduced to a bare name: treat
                        # the <br> as the first thing in <body>. Catching
                        # Exception instead of everything lets
                        # KeyboardInterrupt/SystemExit propagate.
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = base.tag('xhtml', 'p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = list(filter(None, map(lambda x: x.strip(), style)))
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            # Drop <embed>, and every <object> that is not an SVG image.
            for tag in base.XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in base.XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                    continue
                tag.getparent().remove(tag)

            # Drop empty <title>/<style>, empty <script> without a src
            # (MathJax config scripts are kept) and any <script> in <body>.
            for tag in base.XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in base.XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in base.XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            # Forms with real form controls as direct children are removed;
            # other <form> elements become plain <div>s.
            formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
                    './h:label|./h:fieldset|./h:legend')
            for tag in base.XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = base.tag('xhtml', 'div')

            for tag in base.XPath('//h:center')(root):
                tag.tag = base.tag('xhtml', 'div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in base.XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = base.XPath('ancestor::h:table')
            for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = base.tag('xhtml', 'div')

            # Strip zero width spaces / soft hyphens and turn non breaking
            # hyphens into plain hyphens (see special_chars above).
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from css_parser.css import CSSRule
                for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.'+lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')

    # }}}

    def workaround_sony_quirks(self):  # {{{
        """
        Perform toc link transforms to alleviate slow loading.

        Walks the whole TOC tree and drops the ``#fragment`` part of an
        entry's href when the element it targets is reported by
        ``item_at_top`` to be at the top of its spine document.
        """
        from ebook_converter.ebooks.oeb.polish.toc import item_at_top

        def frag_is_at_top(root, frag):
            # True when the first element carrying ``frag`` as id or name is
            # at the top of the document; False when no element matches.
            targets = base.XPath(
                '//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
            return item_at_top(targets[0]) if targets else False

        def simplify_toc_entry(toc):
            if toc.href:
                href, frag = urllib.parse.urldefrag(toc.href)
                if frag:
                    # Only the first spine item with this href is examined.
                    target = next((candidate for candidate in self.oeb.spine
                                   if candidate.href == href), None)
                    if target is not None and frag_is_at_top(target.data,
                                                             frag):
                        self.log.debug('Removing anchor from TOC href:',
                                href+'#'+frag)
                        toc.href = href
            for child in toc:
                simplify_toc_entry(child)

        if self.oeb.toc:
            simplify_toc_entry(self.oeb.toc)
Ejemplo n.º 11
0
class RecipeInput(InputFormatPlugin):
    '''
    Input plugin that builds a book from a news recipe.

    ``convert`` obtains a compiled recipe from one of four places (a
    ``downloaded_recipe`` archive, the ``CALIBRE_RECIPE_URN`` environment
    variable, a readable recipe file, or a builtin recipe looked up by
    title), instantiates it and returns the path of an ``.opf`` file found
    under the current directory.
    '''

    name        = 'Recipe Input'
    author      = 'Kovid Goyal'
    description = 'Download periodical content from the internet'
    file_types  = {'recipe', 'downloaded_recipe'}
    commit_name = 'recipe_input'

    recommendations = {
        ('chapter', None, OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('input_encoding', None, OptionRecommendation.HIGH),
        ('input_profile', 'default', OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('insert_metadata', False, OptionRecommendation.HIGH),
        }

    options = {
        OptionRecommendation(name='test', recommended_value=False,
            help='Useful for recipe development. Forces max_articles_per_feed '
                 'to 2 and downloads at most 2 feeds. You can change the '
                 'number of feeds and articles by supplying optional '
                 'arguments. For example: --test 3 1 will download at most 3 '
                 'feeds and only 1 article per feed.'),
        OptionRecommendation(name='username', recommended_value=None,
            help='Username for sites that require a login to access content.'),
        OptionRecommendation(name='password', recommended_value=None,
            help='Password for sites that require a login to access content.'),
        OptionRecommendation(name='dont_download_recipe',
            recommended_value=False,
            help='Do not download latest version of builtin recipes from the '
                 'calibre server'),
        OptionRecommendation(name='lrf', recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
        }

    def convert(self, recipe_or_file, opts, file_ext, log,
            accelerators):
        '''
        Compile and instantiate a recipe, then locate the resulting OPF.

        :param recipe_or_file: path to a recipe file or to a
            ``downloaded_recipe`` archive; when it is not a readable file it
            is used to derive the title of a builtin recipe.
        :param opts: conversion options; ``opts.output_profile.flow_size`` is
            reset to 0 and every entry of the recipe's ``conversion_options``
            is copied onto it.
        :param file_ext: ``'downloaded_recipe'`` selects the archive branch,
            any other value the regular recipe branch.
        :param log: logger, called directly and via ``warn``/``exception``.
        :param accelerators: not used by this method.
        :return: absolute path of the first ``.opf`` file found in the
            current directory, else the first one yielded by ``walk('.')``;
            ``None`` when no such file exists.
        :raises ValueError: for an invalid or unknown recipe urn, or when no
            recipe can be found.
        :raises RecipeDisabled: when the recipe defines ``recipe_disabled``.
        '''
        from ebook_converter.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            # The archive already holds the downloaded content; unpack it
            # into the current directory and only instantiate the recipe
            # (``download()`` is not called in this branch).
            from ebook_converter.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            try:
                zf.extractall()
            finally:
                # Release the archive handle even if extraction fails.
                zf.close()
            with open('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from ebook_converter.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                # "<type>:<id>" -> (type, id); id is empty without a colon.
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with open(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from ebook_converter.web.feeds.recipes.collection import (
                        get_builtin_recipe_by_title, get_builtin_recipe_titles)
                # First try the base name without extension as the title;
                # if that is not a known title, retry with the full argument
                # (extension stripped) in case the title contains a path
                # separator.
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        # The version components are stringified before
                        # joining so that a tuple of ints (which the
                        # comparison above suggests) does not raise
                        # TypeError here and get misreported as a compile
                        # failure by the handler below.
                        log.warn(
                        'Downloaded recipe needs calibre version at least: %s' %
                        ('.'.join(map(str, recipe.requires_version))))
                        builtin = True
                except Exception:
                    # Best effort: any failure with the downloaded recipe
                    # falls back to the bundled copy. KeyboardInterrupt and
                    # SystemExit are deliberately allowed to propagate.
                    log.exception('Failed to compile downloaded recipe. Falling '
                            'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title, log=log,
                            download_recipe=False)
                    if raw is None:
                        raise ValueError('Failed to find builtin recipe: '+title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError('%r is not a valid recipe file or builtin recipe' %
                        recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        # Let the recipe override conversion options.
        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        # Prefer an OPF directly in the current directory ...
        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        # ... otherwise take the first one yielded by walk('.').
        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

    def postprocess_book(self, oeb, opts, log):
        '''
        Give the recipe object, when there is one, the chance to post-process
        the book: ``internal_postprocess_book`` runs first, then
        ``postprocess_book``.
        '''
        if self.recipe_object is not None:
            self.recipe_object.internal_postprocess_book(oeb, opts, log)
            self.recipe_object.postprocess_book(oeb, opts, log)

    def specialize(self, oeb, opts, log, output_fmt):
        '''
        When ``opts.no_inline_navbars`` is set, remove every ``div`` whose
        class attribute contains ``calibre_navbar`` from all spine items.
        '''
        if opts.no_inline_navbars:
            from ebook_converter.ebooks.oeb.base import XPath
            for item in oeb.spine:
                for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
                    div.getparent().remove(div)

    def save_download(self, zf):
        '''
        Store the recipe source in archive ``zf`` under the name
        ``download.recipe``, encoding it as UTF-8 first if it is text.
        '''
        raw = self.recipe_source
        if isinstance(raw, str):
            raw = raw.encode('utf-8')
        zf.writestr('download.recipe', raw)
Ejemplo n.º 12
0
class HTMLOutput(OutputFormatPlugin):
    '''
    Output plugin that renders the book as an HTML index page plus a
    ``<name>_files`` directory of content pages, and packs both into a ZIP
    archive.
    '''

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'
    commit_name = 'html_output'

    options = {
        OptionRecommendation(
            name='template_css',
            help='CSS file used for the output instead of the default file'),
        OptionRecommendation(
            name='template_html_index',
            help=
            'Template used for generation of the HTML index file instead of the default file'
        ),
        OptionRecommendation(
            name='template_html',
            help=
            'Template used for the generation of the HTML contents of the book instead of the default file'
        ),
        OptionRecommendation(
            name='extract_to',
            help='Extract the contents of the generated ZIP file to the '
            'specified directory. WARNING: The contents of the directory '
            'will be deleted.'),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    @staticmethod
    def _read_template(custom_path, default_resource):
        '''
        Return the text of a template file, decoded as UTF-8.

        :param custom_path: path of a user supplied template, or ``None`` to
            use the packaged default.
        :param default_resource: resource name of the default template inside
            the ``ebook_converter`` package.
        '''
        if custom_path is None:
            custom_path = pkg_resources.resource_filename('ebook_converter',
                                                          default_resource)
        # Always read bytes and decode exactly once, so that custom and
        # default templates go through the same code path.
        with open(custom_path, 'rb') as f:
            return f.read().decode('utf-8')

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate table of contents

        Builds a nested ``ul``/``li``/``a`` tree from ``oeb_book.toc`` and
        returns it wrapped in a ``div`` element. Link targets are resolved
        while ``output_dir`` is the active ``CurrentDir`` and are made
        relative to the directory of ``ref_url``.
        '''

        with CurrentDir(output_dir):

            def build_node(current_node, parent=None):
                # The root call creates the top level list; nested calls only
                # open a new list when the node actually has children.
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, 'ul')
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(os.path.abspath(unquote(node.href)),
                                   os.path.dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        # Collapse runs of whitespace into single spaces.
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent

            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        '''
        Return the table of contents built by ``generate_toc`` serialised as
        a pretty printed text string without XML declaration.
        '''
        from lxml import etree

        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root,
                              pretty_print=True,
                              encoding='unicode',
                              xml_declaration=False)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` to HTML and write the result to ``output_path``.

        :param oeb_book: the book; its ``manifest``, ``spine``, ``toc`` and
            ``metadata`` are read.
        :param output_path: path of the ZIP archive to create.
        :param input_plugin: not used by this method.
        :param opts: conversion options; ``template_html_index``,
            ``template_html``, ``template_css`` and ``extract_to`` are read.
        :param log: logger, stored on ``self.log``.
        '''
        from lxml import etree
        from ebook_converter.utils import zipfile
        from templite import Templite
        from ebook_converter.polyglot.urllib import unquote
        from ebook_converter.ebooks.html.meta import EasyMeta

        # read template files
        # The default templates used to be opened in text mode and then
        # ``.decode()``d (AttributeError on str), and every template was
        # decoded a second time afterwards; the helper reads bytes and
        # decodes once.
        template_html_index_data = self._read_template(
            opts.template_html_index, 'data/html_export_default_index.tmpl')
        template_html_data = self._read_template(
            opts.template_html, 'data/html_export_default.tmpl')
        template_css_data = self._read_template(
            opts.template_css, 'data/html_export_default.css')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        # Everything is assembled in a temporary directory:
        #   <tempdir>/<name>.html        index page
        #   <tempdir>/<name>_files/...   css and content pages
        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(
            tempdir,
            os.path.basename(re.sub(r'\.zip', '', output_path) + '.html'))
        output_dir = re.sub(r'\.html', '', output_file) + '_files'

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file,
                                              output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir + os.sep + nextLink,
                               os.path.dirname(output_file))
            cssLink = relpath(os.path.abspath(css_path),
                              os.path.dirname(output_file))
            tocUrl = relpath(output_file, os.path.dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                toc=html_toc,
                                meta=meta,
                                nextLink=nextLink,
                                tocUrl=tocUrl,
                                cssLink=cssLink,
                                firstContentPageLink=nextLink)
            if isinstance(t, str):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            # First pass: write every non-spine manifest item as is and
            # create an empty placeholder file for each spine item.
            for item in oeb_book.manifest:
                path = os.path.abspath(unquote(item.href))
                item_dir = os.path.dirname(path)
                if not os.path.exists(item_dir):
                    os.makedirs(item_dir)
                if item.spine_position is not None:
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(item.bytes_representation)
                    item.unload_data_from_memory(memory=path)

            # Second pass: render each spine item through the page template.
            for item in oeb_book.spine:
                path = os.path.abspath(unquote(item.href))
                item_dir = os.path.dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath(
                    '//h:head',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head,
                                              pretty_print=True,
                                              encoding='unicode')
                # Strip the <head> tags themselves and any <style> blocks,
                # and expand a self-closed <title/>.
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(
                    re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                    head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>',
                                      head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath(
                    '//h:body',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body,
                                               pretty_print=True,
                                               encoding='unicode')
                # Strip the <body> tags and expand self-closed div/a/span.
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                       r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position + 1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position + 1].href
                    nextLink = relpath(os.path.abspath(nextLink), item_dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position - 1].href
                    prevLink = relpath(os.path.abspath(prevLink), item_dir)
                else:
                    prevLink = None

                cssLink = relpath(os.path.abspath(css_path), item_dir)
                tocUrl = relpath(output_file, item_dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template
                templite = Templite(template_html_data)
                # The TOC is handed to the template as a callable.
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir
                                                     )
                t = templite.render(ebookContent=ebook_content,
                                    prevLink=prevLink,
                                    nextLink=nextLink,
                                    has_toc=bool(oeb_book.toc.count()),
                                    toc=toc,
                                    tocUrl=tocUrl,
                                    head_content=head_content,
                                    meta=meta,
                                    cssLink=cssLink,
                                    firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t.encode('utf-8'))
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        try:
            zfile.add_dir(output_dir, os.path.basename(output_dir))
            zfile.write(output_file, os.path.basename(output_file),
                        zipfile.ZIP_DEFLATED)

            if opts.extract_to:
                if os.path.exists(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                os.makedirs(opts.extract_to)
                zfile.extractall(opts.extract_to)
                self.log('Zip file extracted to', opts.extract_to)
        finally:
            # Close the archive even when adding or extracting fails.
            zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
Ejemplo n.º 13
0
class PDFInput(InputFormatPlugin):
    '''
    Input plugin that converts a PDF in the current working directory into
    HTML (via ``pdftohtml``) and writes a ``metadata.opf`` describing it.
    '''

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(
            name='no_images',
            recommended_value=False,
            help='Do not extract images from the document'),
        OptionRecommendation(
            name='unwrap_factor',
            recommended_value=0.45,
            help=('Scale used to determine the length at which a line should '
                  'be unwrapped. Valid values are a decimal between 0 and 1. '
                  'The default is 0.45, just below the median line length.')),
        OptionRecommendation(
            name='new_pdf_engine',
            recommended_value=False,
            help=('Use the new PDF conversion engine. '
                  'Currently not operational.')),
    }

    def convert_new(self, stream, accelerators):
        '''
        Convert through the XML output of ``pdftohtml`` and ``PDFDocument``.

        Reads ``self.opts`` and ``self.log``, which ``convert`` sets before
        delegating here. Returns the path of ``metadata.opf`` in the current
        working directory.
        '''
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml
        from ebook_converter.utils.cleantext import clean_ascii_chars
        from ebook_converter.ebooks.pdf.reflow import PDFDocument

        pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with open('index.xml', 'rb') as xml_file:
            cleaned_xml = clean_ascii_chars(xml_file.read())
        PDFDocument(cleaned_xml, self.opts, self.log)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the PDF behind ``stream`` and return the path of the
        generated ``metadata.opf``.

        ``options`` and ``log`` are stored on the instance. When
        ``options.new_pdf_engine`` is set the work is handed to
        ``convert_new``. ``file_ext`` is not used.
        '''
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts = options
        self.log = log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(os.getcwd(), stream.name, options.no_images)

        from ebook_converter.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        metadata = get_metadata(stream, 'pdf')
        creator = OPFCreator(os.getcwd(), metadata)

        # index.html comes first in the manifest, followed by every other
        # entry of the working directory.
        main_html = 'index.html'
        other_entries = os.listdir(os.getcwd())
        other_entries.remove(main_html)
        manifest = [(main_html, None)]
        manifest.extend((entry, None) for entry in other_entries)
        log.debug('Generating manifest...')
        creator.create_manifest(manifest)

        creator.create_spine([main_html])
        log.debug('Rendering manifest...')
        with open('metadata.opf', 'wb') as opf_out:
            creator.render(opf_out)

        # If a toc.ncx exists and has a manifest id, patch that id into the
        # <spine> tag of the OPF that was just written.
        ncx_id = None
        if os.path.exists('toc.ncx'):
            ncx_id = creator.manifest.id_for_path('toc.ncx')
        if ncx_id:
            with open('metadata.opf', 'r+b') as opf_file:
                content = opf_file.read()
                content = content.replace(
                    b'<spine',
                    b'<spine toc="%s"' % polyglot.as_bytes(ncx_id))
                opf_file.seek(0)
                opf_file.write(content)

        return os.path.join(os.getcwd(), 'metadata.opf')
Ejemplo n.º 14
0
class HTMLZOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the book as a single HTML file, together with
    optional stylesheet, images, cover and ``metadata.opf``, into one zip
    archive at the output path.
    '''

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    # Choice keys and their human readable descriptions; the keys double as
    # the accepted values of the options below.
    ui_data = {'css_choices': {'class': 'Use CSS classes',
                               'inline': 'Use the style attribute',
                               'tag': 'Use HTML tags wherever possible'},
               'sheet_choices': {'external': 'Use an external CSS file',
                                 'inline': 'Use a <style> tag in the HTML '
                                 'file'}}

    options = {
        OptionRecommendation(name='htmlz_css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help='Specify the handling of CSS. Default is class.\n'
                 'class: {class}\n'
                 'inline: {inline}\n'
                 'tag: {tag}'.format(**ui_data['css_choices'])),
        OptionRecommendation(name='htmlz_class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help='How to handle the CSS when using css-type = \'class\'.\n'
                 'Default is external.\n'
                 'external: {external}\n'
                 'inline: {inline}'.format(**ui_data['sheet_choices'])),
        OptionRecommendation(name='htmlz_title_filename',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='If set this option causes the file name of the HTML file '
                 'inside the HTMLZ archive to be based on the book title.'
        )
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` into a temporary directory and zip that
        directory to ``output_path``.

        :param oeb_book: the book; its ``manifest``, ``metadata`` and
            ``guide`` are read.
        :param output_path: path of the archive to create.
        :param input_plugin: not used by this method.
        :param opts: conversion options; ``htmlz_css_type``,
            ``htmlz_class_style`` and ``htmlz_title_filename`` are read.
        :param log: logger handed to the HTML generator.
        '''
        from lxml import etree
        from ebook_converter.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from ebook_converter.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory('_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = 'index'
            if opts.htmlz_title_filename:
                from ebook_converter.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(str(oeb_book.metadata.title[0])),))[0]
            with open(os.path.join(tdir, fname+'.html'), 'wb') as tf:
                if isinstance(html, str):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode, so text has to be
                # encoded first (same treatment as the HTML above).
                if isinstance(css, str):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, 'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, 'images')):
                    os.makedirs(os.path.join(tdir, 'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            # SVG data is a parsed tree; tostring() with
                            # encoding='unicode' yields str, which must be
                            # encoded before it can go into a binary file.
                            data = etree.tostring(
                                item.data, encoding='unicode').encode('utf-8')
                        else:
                            data = item.data
                        img_path = os.path.join(tdir, 'images', images[item.href])
                        with open(img_path, 'wb') as img:
                            img.write(data)

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from ebook_converter.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, 'cover.jpg')
                    # Create the (empty) file before handing the path over.
                    with open(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # The cover is best effort: report the problem and carry on.
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = 'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised before the
                # temporary directory goes away, instead of relying on
                # garbage collection.
                htmlz.close()
Ejemplo n.º 15
0
class AZW3Output(OutputFormatPlugin):
    '''
    Output plugin that writes the book as a KF8 (AZW3) file.
    '''

    name = 'AZW3 Output'
    author = 'Kovid Goyal'
    file_type = 'azw3'
    commit_name = 'azw3_output'

    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='When present, use author sort field as author.',
        ),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=('Don\'t add Table of Contents to the book. Useful if '
                  'the book has its own table of contents.'),
        ),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help='Title for any generated in-line table of contents.',
        ),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Disable compression of the file contents.',
        ),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=('When adding the Table of Contents to the book, add it at the start of the '
                  'book instead of the end. Not recommended.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=('Extract the contents of the generated %s file to the '
                  'specified directory. The contents of the directory are first '
                  'deleted, so be careful.') % 'AZW3',
        ),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=('Enable sharing of book content via Facebook etc. '
                  ' on the Kindle. WARNING: Using this feature means that '
                  ' the book will not auto sync its last read position '
                  ' on multiple devices. Complain to Amazon.'),
        ),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Build a KF8 book from ``oeb`` and write it to ``output_path``.

        ``oeb``, ``opts`` and ``log`` are stored on the instance and
        ``opts.mobi_periodical`` is set from ``self.is_periodical``. Unless
        ``opts.mobi_passthrough`` is truthy, the HTML cover is removed and
        the book is split before the KF8 book is created. ``input_plugin``
        is not used.
        '''
        from ebook_converter.ebooks.mobi.writer2.resources import Resources
        from ebook_converter.ebooks.mobi.writer8.main import create_kf8_book
        from ebook_converter.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors

        self.oeb = oeb
        self.opts = opts
        self.log = log
        opts.mobi_periodical = self.is_periodical
        is_passthrough = getattr(opts, 'mobi_passthrough', False)
        remove_duplicate_anchors(oeb)

        book_resources = Resources(
            self.oeb, self.opts, self.is_periodical,
            add_fonts=True, process_images=False)

        if not is_passthrough:
            remove_html_cover(self.oeb, self.log)

            # Split on pagebreaks so that the resulting KF8 is faster to load
            from ebook_converter.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)

        book = create_kf8_book(self.oeb, self.opts, book_resources,
                               for_joint=False)
        book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        '''
        Apply ``CSSCleanup`` (constructed from ``log`` and ``opts``) to
        ``item`` and ``stylizer``.
        '''
        from ebook_converter.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)
Ejemplo n.º 16
0
class LRFOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the book as an LRF file.

    Image collections are rendered page by page with ``convert_images``;
    everything else is first written out through the ``oeb`` output plugin
    and then handed to ``process_file``.
    '''

    name = 'LRF Output'
    author = 'Kovid Goyal'
    file_type = 'lrf'
    commit_name = 'lrf_output'

    options = {
        OptionRecommendation(
            name='enable_autorotation',
            recommended_value=False,
            help='Enable auto-rotation of images that are wider than the '
            'screen width.'),
        OptionRecommendation(
            name='wordspace',
            recommended_value=2.5,
            level=OptionRecommendation.LOW,
            help='Set the space between words in pts. Default is %default'),
        OptionRecommendation(
            name='header',
            recommended_value=False,
            help='Add a header to all the pages with title and author.'),
        OptionRecommendation(
            name='header_format',
            recommended_value="%t by %a",
            help='Set the format of the header. %a is replaced by the author '
            'and %t by the title. Default is %default'),
        OptionRecommendation(
            name='header_separation',
            recommended_value=0,
            help='Add extra spacing below the header. Default is %default pt.'
        ),
        OptionRecommendation(
            name='minimum_indent',
            recommended_value=0,
            help='Minimum paragraph indent (the indent of the first line '
            'of a paragraph) in pts. Default: %default'),
        OptionRecommendation(name='render_tables_as_images',
                             recommended_value=False,
                             help='This option has no effect'),
        OptionRecommendation(
            name='text_size_multiplier_for_rendered_tables',
            recommended_value=1.0,
            help='Multiply the size of text in rendered tables by this '
            'factor. Default is %default'),
        OptionRecommendation(name='serif_family',
                             recommended_value=None,
                             help='The serif family of fonts to embed'),
        OptionRecommendation(name='sans_family',
                             recommended_value=None,
                             help='The sans-serif family of fonts to embed'),
        OptionRecommendation(name='mono_family',
                             recommended_value=None,
                             help='The monospace family of fonts to embed'),
    }

    recommendations = {('change_justification', 'original',
                        OptionRecommendation.HIGH)}

    def convert_images(self, pages, opts, wide):
        '''
        Write an LRF book with one full-screen image per page.

        :param pages: iterable of images; each one is wrapped in an
            ``ImageStream``.
        :param opts: options object; ``title``, ``author`` and ``output``
            (the path of the file to write) are read.
        :param wide: selects the 784x1012 page size when truthy, otherwise
            584x754.
        '''
        from ebook_converter.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
        from uuid import uuid4
        from ebook_converter.constants_old import __appname__, __version__

        width, height = (784, 1012) if wide else (584, 754)

        # Page style without margins, so each image fills the text area.
        ps = {}
        ps['topmargin'] = 0
        ps['evensidemargin'] = 0
        ps['oddsidemargin'] = 0
        ps['textwidth'] = width
        ps['textheight'] = height
        book = Book(title=opts.title,
                    author=opts.author,
                    bookid=uuid4().hex,
                    publisher='%s %s' % (__appname__, __version__),
                    category='Comic',
                    pagestyledefault=ps,
                    booksetting=BookSetting(screenwidth=width,
                                            screenheight=height))
        for page in pages:
            imageStream = ImageStream(page)
            _page = book.create_page()
            _page.append(
                ImageBlock(refstream=imageStream,
                           blockwidth=width,
                           blockheight=height,
                           xsize=width,
                           ysize=height,
                           x1=width,
                           y1=height))
            book.append(_page)

        # Use a context manager so the output file is flushed and closed
        # deterministically instead of being left to garbage collection.
        with open(opts.output, 'wb') as lrf_file:
            book.renderLrf(lrf_file)

    def flatten_toc(self):
        '''
        Replace ``self.oeb.toc`` with a single-level TOC that holds the
        title and href of every descendant of the original one.
        '''
        from ebook_converter.ebooks.oeb.base import TOC
        nroot = TOC()
        for x in self.oeb.toc.iterdescendants():
            nroot.add(x.title, x.href)
        self.oeb.toc = nroot

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Convert ``oeb`` to LRF at ``output_path``.

        ``log``, ``opts`` and ``oeb`` are stored on the instance. When
        ``input_plugin.is_image_collection`` is set, the images returned by
        ``input_plugin.get_images()`` are rendered directly and nothing else
        happens. Otherwise the TOC is flattened, the book is written to a
        temporary directory by the ``oeb`` output plugin, and the first
        ``.opf`` file listed there is passed to ``process_file``.
        '''
        self.log, self.opts, self.oeb = log, opts, oeb

        lrf_opts = LRFOptions(output_path, opts, oeb)

        if input_plugin.is_image_collection:
            self.convert_images(input_plugin.get_images(), lrf_opts,
                                getattr(opts, 'wide', False))
            return

        self.flatten_toc()

        from ebook_converter.ptempfile import TemporaryDirectory
        with TemporaryDirectory('_lrf_output') as tdir:
            from ebook_converter.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            from ebook_converter.ebooks.lrf.html.convert_from import process_file
            process_file(os.path.join(tdir, opf), lrf_opts, self.log)
Ejemplo n.º 17
0
class MOBIOutput(OutputFormatPlugin):
    """Output plugin that writes MOBI files.

    Depending on the ``mobi_file_type`` option the result contains the old
    MOBI 6 format, the newer KF8 format, or both.
    """

    # Plugin identification
    name = 'MOBI Output'
    author = 'Kovid Goyal'
    file_type = 'mobi'
    commit_name = 'mobi_output'
    # Values offered for the ``mobi_file_type`` option below
    ui_data = {'file_types': ['old', 'both', 'new']}

    # Conversion options specific to this output format
    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='When present, use author sort field as author.'),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Don\'t add Table of Contents to the book. Useful if '
            'the book has its own table of contents.'),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help='Title for any generated in-line table of contents.'),
        OptionRecommendation(name='dont_compress',
                             recommended_value=False,
                             level=OptionRecommendation.LOW,
                             help='Disable compression of the file contents.'),
        OptionRecommendation(
            name='personal_doc',
            recommended_value='[PDOC]',
            help='Tag for MOBI files to be marked as personal documents.'
            ' This option has no effect on the conversion. It is used'
            ' only when sending MOBI files to a device. If the file'
            ' being sent has the specified tag, it will be marked as'
            ' a personal document when sent to the Kindle.'),
        OptionRecommendation(
            name='mobi_ignore_margins',
            recommended_value=False,
            help='Ignore margins in the input document. If False, then '
            'the MOBI output plugin will try to convert margins specified'
            ' in the input document, otherwise it will ignore them.'),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=
            'When adding the Table of Contents to the book, add it at the start of the '
            'book instead of the end. Not recommended.'),
        OptionRecommendation(
            name='extract_to',
            help='Extract the contents of the generated %s file to the '
            'specified directory. The contents of the directory are first '
            'deleted, so be careful.' % 'MOBI'),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help='Enable sharing of book content via Facebook etc. '
            ' on the Kindle. WARNING: Using this feature means that '
            ' the book will not auto sync its last read position '
            ' on multiple devices. Complain to Amazon.'),
        OptionRecommendation(
            name='mobi_keep_original_images',
            recommended_value=False,
            help='By default calibre converts all images to JPEG format '
            'in the output MOBI file. This is for maximum compatibility '
            'as some older MOBI viewers have problems with other image '
            'formats. This option tells calibre not to do this. '
            'Useful if your document contains lots of GIF/PNG images that '
            'become very large when converted to JPEG.'),
        # Selects which container format(s) convert() produces
        OptionRecommendation(
            name='mobi_file_type',
            choices=ui_data['file_types'],
            recommended_value='old',
            help='By default calibre generates MOBI files that contain the '
            'old MOBI 6 format. This format is compatible with all '
            'devices. However, by changing this setting, you can tell '
            'calibre to generate MOBI files that contain both MOBI 6 and '
            'the new KF8 format, or only the new KF8 format. KF8 has '
            'more features than MOBI 6, but only works with newer Kindles. '
            'Allowed values: {}'.format('old, both, new'))
    }

    def check_for_periodical(self):
        """Record on ``self.opts`` whether the book is a periodical.

        For periodicals the TOC is first reshaped by ``periodicalize_toc``
        and a masthead is ensured by ``check_for_masthead``; only then is
        ``opts.mobi_periodical`` set to True. Otherwise it is set to False.
        """
        if not self.is_periodical:
            self.opts.mobi_periodical = False
            return
        self.periodicalize_toc()
        self.check_for_masthead()
        self.opts.mobi_periodical = True

    def check_for_masthead(self):
        """Ensure the book's guide has a ``masthead`` entry.

        When the guide already has one it is used as is. Otherwise a
        masthead image is generated from the first title in the metadata,
        added to the manifest as ``image/gif`` and registered in the guide.
        """
        if 'masthead' in self.oeb.guide:
            self.oeb.log.debug('Using mastheadImage supplied in manifest...')
            return

        from ebook_converter.ebooks import generate_masthead
        self.oeb.log.debug('No masthead found in manifest, generating '
                           'default mastheadImage...')
        book_title = str(self.oeb.metadata['title'][0])
        image_data = generate_masthead(book_title)
        item_id, href = self.oeb.manifest.generate('masthead', 'masthead')
        self.oeb.manifest.add(item_id, href, 'image/gif', data=image_data)
        self.oeb.guide.add('masthead', 'Masthead Image', href)

    def periodicalize_toc(self):
        """Reshape the TOC into periodical -> section -> article form.

        Nothing is done when the TOC is empty, when the spine has fewer than
        three items, or when the first TOC entry is already of class
        ``periodical``. Otherwise the existing entries are regrouped under a
        single new ``periodical`` root node and some leading spine items are
        removed from the manifest.
        """
        from ebook_converter.ebooks.oeb.base import TOC
        toc = self.oeb.toc
        if not toc or len(self.oeb.spine) < 3:
            return
        if toc and toc[0].klass != 'periodical':
            # Capture the first two spine items before anything is removed
            one, two = self.oeb.spine[0], self.oeb.spine[1]
            self.log.info('Converting TOC for MOBI periodical indexing...')

            # Maps id(section) -> list of that section's article nodes
            articles = {}
            if toc.depth() < 3:
                # single section periodical
                self.oeb.manifest.remove(one)
                self.oeb.manifest.remove(two)
                # NOTE(review): spine[0] is read after the removals above;
                # presumably manifest.remove() also drops the item from the
                # spine, so this is the first remaining item - confirm.
                sections = [
                    TOC(klass='section',
                        title='All articles',
                        href=self.oeb.spine[0].href)
                ]
                for x in toc:
                    sections[0].nodes.append(x)
            else:
                # multi-section periodical
                self.oeb.manifest.remove(one)
                sections = list(toc)
                for i, x in enumerate(sections):
                    x.klass = 'section'
                    articles_ = list(x)
                    if articles_:
                        # Drop the section's own page and point the section
                        # at its first article instead
                        self.oeb.manifest.remove(
                            self.oeb.manifest.hrefs[x.href])
                        x.href = articles_[0].href

            # Detach every child from its section, marking it as an article
            for sec in sections:
                articles[id(sec)] = []
                for a in list(sec):
                    a.klass = 'article'
                    articles[id(sec)].append(a)
                    sec.nodes.remove(a)

            root = TOC(klass='periodical',
                       href=self.oeb.spine[0].href,
                       title=str(self.oeb.metadata.title[0]))

            # Re-attach articles; sections without articles are left out
            for s in sections:
                if articles[id(s)]:
                    for a in articles[id(s)]:
                        s.nodes.append(a)
                    root.nodes.append(s)

            # Empty the original TOC and install the periodical root
            for x in list(toc.nodes):
                toc.nodes.remove(x)

            toc.nodes.append(root)

            # Fix up the periodical href to point to first section href
            # NOTE(review): raises IndexError if no section had any article
            toc.nodes[0].href = toc.nodes[0].nodes[0].href

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` as MOBI 6, KF8 or a joint file.

        The file type comes from ``opts.mobi_file_type`` but is forced to
        ``'old'`` for periodicals. For ``'new'`` only the KF8 book is
        written; otherwise a MOBI 6 file is written, with the KF8 book
        passed along when the type is ``'both'``.
        """
        from ebook_converter.ebooks.mobi.writer2.resources import Resources
        self.log = log
        self.opts = opts
        self.oeb = oeb

        mobi_type = opts.mobi_file_type
        if self.is_periodical:
            # Amazon does not support KF8 periodicals
            mobi_type = 'old'
        wants_kf8 = mobi_type == 'new' or mobi_type == 'both'

        remove_html_cover(self.oeb, self.log)
        resources = Resources(oeb, opts, self.is_periodical,
                              add_fonts=wants_kf8)
        self.check_for_periodical()

        kf8 = None
        if wants_kf8:
            from ebook_converter.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
            remove_duplicate_anchors(self.oeb)
            # Split on pagebreaks so that the resulting KF8 is faster to load
            from ebook_converter.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)
            kf8 = self.create_kf8(resources, for_joint=(mobi_type == 'both'))

        if mobi_type == 'new':
            # KF8 only: no MOBI 6 part is written
            kf8.write(output_path)
            extract_mobi(output_path, opts)
            return

        self.log.info('Creating MOBI 6 output')
        self.write_mobi(input_plugin, output_path, kf8, resources)

    def create_kf8(self, resources, for_joint=False):
        """Build and return the KF8 book for the current ``oeb``/``opts``.

        ``for_joint`` is forwarded unchanged to ``create_kf8_book``.
        """
        from ebook_converter.ebooks.mobi.writer8.main import create_kf8_book
        book = create_kf8_book(self.oeb, self.opts, resources,
                               for_joint=for_joint)
        return book

    def write_mobi(self, input_plugin, output_path, kf8, resources):
        """Run the MOBI 6 transforms on the book and write the file.

        In order: optionally add an inline HTML TOC, mangle case, rasterize
        SVG (skipped with a warning when unavailable), apply the jacket
        workaround, convert to MobiML, and finally write ``output_path``
        with ``MobiWriter``. ``kf8`` is handed to the writer as given.
        """
        from ebook_converter.ebooks.mobi.mobiml import MobiMLizer
        from ebook_converter.ebooks.oeb.transforms.manglecase import CaseMangler
        from ebook_converter.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from ebook_converter.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from ebook_converter.customize.ui import plugin_for_input_format

        opts = self.opts
        oeb = self.oeb

        if not opts.no_inline_toc:
            toc_position = 'start' if opts.mobi_toc_at_start else 'end'
            add_toc = HTMLTOCAdder(title=opts.toc_title,
                                   position=toc_position)
            add_toc(oeb, opts)

        CaseMangler()(oeb, opts)

        try:
            SVGRasterizer()(oeb, opts)
        except Unavailable:
            self.log.warning('SVG rasterizer unavailable, SVG will not be '
                             'converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()

        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)

        MobiMLizer(ignore_tables=opts.linearize_tables)(oeb, opts)

        # Page breaks after items are written for every input except CBZ
        cbz_plugin = plugin_for_input_format('cbz')
        write_page_breaks_after_item = input_plugin is not cbz_plugin

        from ebook_converter.ebooks.mobi.writer2.main import MobiWriter
        write_book = MobiWriter(
            opts, resources, kf8,
            write_page_breaks_after_item=write_page_breaks_after_item)
        write_book(oeb, output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Apply the KF8 ``CSSCleanup`` pass to ``item`` using ``stylizer``."""
        from ebook_converter.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)

    def workaround_fire_bugs(self, jacket):
        """Replace the jacket's layout table with div/span elements.

        Every ``table`` and ``tr`` element in ``jacket.data`` becomes an
        XHTML ``div`` and every ``td`` becomes an XHTML ``span``.
        """
        # The Fire crashes when trying to render the table used to lay out
        # the jacket
        from ebook_converter.ebooks.oeb.base import XHTML
        tables = jacket.data.xpath('//*[local-name()="table"]')
        for table in tables:
            table.tag = XHTML('div')
            rows = table.xpath('descendant::*[local-name()="tr"]')
            for row in rows:
                cells = row.xpath('descendant::*[local-name()="td"]')
                row.tag = XHTML('div')
                for cell in cells:
                    # Inside this loop the cell list is never empty, so the
                    # tag is always a span
                    cell.tag = XHTML('span')
Ejemplo n.º 18
0
class TXTOutput(OutputFormatPlugin):
    """Output plugin that writes the book as plain, Markdown or Textile text."""

    # Plugin identification
    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    # Choice lists shared with the option definitions below
    ui_data = {
        'newline_types': NEWLINE_TYPES,
        'formatting_types': {
            'plain': 'Plain text',
            'markdown': 'Markdown formatted text',
            'textile': 'TexTile formatted text'
        },
    }

    # Conversion options specific to this output format
    options = {
        OptionRecommendation(
            name='newline',
            recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n',
            choices=NEWLINE_TYPES,
            help=
            'Type of newline to use. Options are %s. Default is \'system\'. '
            'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
            'For macOS use \'unix\'. \'system\' will default to the newline '
            'type used by this OS.' % sorted(NEWLINE_TYPES)),
        OptionRecommendation(
            name='txt_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
            'The default is utf-8.'),
        OptionRecommendation(
            name='inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.'),
        OptionRecommendation(
            name='max_line_length',
            recommended_value=0,
            level=OptionRecommendation.LOW,
            help='The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is '
            'found the line will be broken at the space after and will '
            'exceed the specified value. Also, there is a minimum of 25 '
            'characters. Use 0 to disable line splitting.'),
        OptionRecommendation(
            name='force_max_line_length',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Force splitting on the max-line-length value when no space '
            'is present. Also allows max-line-length to be below the '
            'minimum'),
        # Selects the writer class used by convert()
        OptionRecommendation(name='txt_output_formatting',
                             recommended_value='plain',
                             choices=list(ui_data['formatting_types']),
                             help='Formatting used within the document.\n'
                             '* plain: {plain}\n'
                             '* markdown: {markdown}\n'
                             '* textile: {textile}'
                             ''.format(**ui_data['formatting_types'])),
        OptionRecommendation(
            name='keep_links',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not remove links within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain '
            'text output.'),
        OptionRecommendation(
            name='keep_image_references',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not remove image references within the document. This is '
            'only useful when paired with a txt-output-formatting option '
            'that is not none because links are always removed with '
            'plain text output.'),
        OptionRecommendation(
            name='keep_color',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not remove font color from output. This is only useful '
            'when txt-output-formatting is set to textile. Textile is '
            'the only formatting that supports setting font color. If '
            'this option is not specified font color will not be set and '
            'default to the color displayed by the reader (generally '
            'this is black).')
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render ``oeb_book`` as text and write it to ``output_path``.

        The writer is chosen from ``opts.txt_output_formatting`` (markdown,
        textile, anything else is plain text). The text is cleaned, its
        newlines are converted to the type named by ``opts.newline`` and the
        result is encoded with ``opts.txt_output_encoding`` (unencodable
        characters are replaced).

        ``output_path`` may be a filesystem path or an object with a
        ``write`` method. A path is opened (creating missing parent
        directories) and always closed again, even if writing fails. A
        stream passed in by the caller is rewound and truncated but left
        open.
        """
        from ebook_converter.ebooks.txt.txtml import TXTMLizer
        from ebook_converter.utils.cleantext import clean_ascii_chars
        from ebook_converter.ebooks.txt.newlines import specified_newlines, TxtNewlines

        formatting = opts.txt_output_formatting.lower()
        if formatting == 'markdown':
            from ebook_converter.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif formatting == 'textile':
            from ebook_converter.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # Only streams opened here are closed here
        owns_stream = not hasattr(output_path, 'write')
        if owns_stream:
            out_dir = os.path.dirname(output_path)
            if out_dir:
                # exist_ok avoids the race between an existence check and
                # the directory creation
                os.makedirs(out_dir, exist_ok=True)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
        finally:
            if owns_stream:
                out_stream.close()
Ejemplo n.º 19
0
class HTMLInput(InputFormatPlugin):
    """Input plugin that turns HTML files or an OPF into an OEB book."""

    # Plugin identification
    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    # Conversion options specific to this input format
    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Traverse links in HTML files breadth first. Normally, '
                    'they are traversed depth first.'
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help='Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
        ),

        # convert() rejects this option for anything but OPF input
        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.'
        ),

    }

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        """Build an OEB book from an HTML file or an OPF.

        OPF input is delegated to the plumber's ``create_oebbook``. For HTML
        input, metadata read from the document is merged into metadata
        derived from the file name (when the stream has a name) and the book
        is assembled by ``self.create_oebbook``.

        Raises ValueError when ``opts.dont_package`` is set for HTML input.
        """
        basedir = os.getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            stream_path = stream.name
            basedir = os.path.dirname(stream_path)
            fname = os.path.basename(stream_path)

        if file_ext == 'opf':
            from ebook_converter.ebooks.conversion.plumber import create_oebbook
            return create_oebbook(log, stream.name, opts,
                                  encoding=opts.input_encoding)

        if opts.dont_package:
            raise ValueError('The --dont-package option is not supported for an HTML input file')

        from ebook_converter.ebooks.metadata.html import get_metadata
        mi = get_metadata(stream)
        if fname:
            from ebook_converter.ebooks.metadata.meta import metadata_from_filename
            # File-name metadata is the base; document metadata updates it
            merged = metadata_from_filename(fname)
            merged.smart_update(mi)
            mi = merged
        return self.create_oebbook(stream.name, basedir, opts, log, mi)

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Assemble an OEB book from ``htmlpath`` and the files it links to.

        Steps, in order: create an unpopulated book, fill in metadata from
        ``mi`` (with fallbacks for language, creator and title), add every
        non-binary file found by ``get_filelist`` to the manifest and spine,
        rewrite links in the HTML and CSS through ``resource_adder`` (which
        pulls referenced local files into the manifest), and finally build a
        TOC from document titles or first headings.

        Returns the populated book, which is also stored as ``self.oeb``.
        """
        import uuid
        from ebook_converter.ebooks.conversion.plumber import create_oebbook
        from ebook_converter.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES,
            xpath, urlquote)
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from ebook_converter.ebooks.html.input import get_filelist
        from ebook_converter.ebooks.metadata import string_to_authors
        from ebook_converter.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        # Kept on self because resource_adder() needs it later
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # --- Metadata, with fallbacks for missing fields ---
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn('Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate('Unknown')]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate('Unknown'))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        # NOTE(review): once any identifier with an 'id' attribute is found,
        # the FIRST identifier is used as the uid, not the matching one
        # (`ident`) - confirm this is intended.
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        # --- Add the HTML files to the manifest and spine ---
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        # Maps source file path -> manifest href
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            # Point the container at this file's directory before adding it
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            # Only the root file's name is quoted, and only if it has a '%'
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        # Maps absolute source path -> manifest href; shared with
        # resource_adder()
        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            self.added_resources[path] = href
        # Stash helpers on self for use by resource_adder()
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urllib.parse.urldefrag
        self.BINARY_MIME = BINARY_MIME

        # --- Rewrite links, pulling linked resources into the manifest ---
        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data,
                          functools.partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                # Resolve the stylesheet's source directory by reverse
                # lookup of its href; stays None if it is not found
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        functools.partial(self.resource_adder, base=dpath))

        # --- Build the TOC from <title> text or first headings ---
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            # Placeholder, replaced by the first non-empty heading text
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        # Titles are preferred unless they contain duplicates
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        # NOTE(review): `use` has entries for linear items only (and
        # `titles` skips items without a title), yet it is zipped against
        # the whole spine, so labels can be paired with the wrong item -
        # confirm.
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        """Resolve ``link_`` to a local filesystem path and fragment.

        ``link_`` may be text or UTF-8 encoded bytes. ``base`` is the
        directory relative links are resolved against; the current working
        directory is used when it is empty.

        Returns ``(path, fragment)``, or ``(None, None)`` when the link
        cannot be decoded or parsed, is not a local resource, or resolves to
        an empty path. Failures are logged, never raised.
        """
        if not isinstance(link_, str):
            try:
                # 'strict' is the registered handler name; the former
                # 'error' is not one and only failed lazily
                link_ = link_.decode('utf-8', 'strict')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring'%link_)
                return None, None
        from ebook_converter.ebooks.html.input import Link
        try:
            l = Link(link_, base if base else os.getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r'%link_)
            return None, None
        if l.path is None:
            # Not a local resource
            return None, None
        link = l.path.replace('/', os.sep).strip()
        frag = l.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        """Link-rewriting callback that pulls local files into the book.

        If ``link_`` points to a readable local file, the file is added to
        the manifest (once per absolute path) and the manifest href, plus
        any fragment, is returned. Links that are not local, not readable or
        point to a directory are returned unchanged. Links to files guessed
        as ``text/plain`` are ignored and ``None`` is returned.
        """
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            # Unusable path: leave the link as it was
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
            guessed = mimetypes.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r'%link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = mimetypes.guess_type('dummy.'+img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            # The container must point at the file's directory so that the
            # item's data can be read below
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                    self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            # XXX(gryf): why the heck it was changed to bytes?
            item.html_input_href = urllib.parse.quote(bhref)
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = functools.partial(
                        self.css_import_handler, os.path.dirname(link))
            # Attribute access for its side effect: forces the load
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        """Fetch callback for stylesheet imports, resolved relative to ``base``.

        Returns ``(None, css_text)`` where the text has been decoded as
        UTF-8 (invalid bytes replaced) and run through the book's CSS
        preprocessor. Returns ``(None, None)`` when ``href`` is not a
        readable local file or reading/preprocessing fails; failures are
        logged.
        """
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
            return (None, None)
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except Exception:
            # Best effort: a broken import must not abort the conversion,
            # but interpreter-exit signals are no longer swallowed
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
Ejemplo n.º 20
0
class FB2Input(InputFormatPlugin):
    """Input plugin that converts FB2/FBZ documents to HTML."""

    # Plugin identification
    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 and FBZ files to HTML'
    file_types = {'fb2', 'fbz'}
    commit_name = 'fb2_input'

    # Recommended TOC detection: h1/h2/h3 headings map to TOC levels 1-3
    recommendations = {('level1_toc', '//h:h1', OptionRecommendation.MED),
                       ('level2_toc', '//h:h2', OptionRecommendation.MED),
                       ('level3_toc', '//h:h3', OptionRecommendation.MED)}

    # Conversion options specific to this input format
    options = {
        OptionRecommendation(name='no_inline_fb2_toc',
                             recommended_value=False,
                             level=OptionRecommendation.LOW,
                             help='Do not insert a Table of Contents '
                             'at the beginning of the book.')
    }

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert an FB2 document into XHTML, CSS and an OPF.

        All output (``index.xhtml``, ``inline-styles.css``, embedded
        binaries, an optional cover and ``metadata.opf``) is written to the
        current working directory.

        Returns the absolute path of the written ``metadata.opf``.
        Raises ValueError if parsing produces no document.
        """
        from ebook_converter.ebooks.metadata.fb2 import ensure_namespace
        from ebook_converter.ebooks.metadata.fb2 import get_fb2_data
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             assume_utf8=True,
                             resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            # Retry with bare ampersands escaped. The trailing space is
            # kept so that "a & b" does not become "a &amp;b".
            doc = etree.fromstring(raw.replace('& ', '&amp; '))
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            # Fall back to the standard FB2 namespace
            fb_ns = FB2NS

        NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS}

        # --- Collect and normalise embedded stylesheets ---
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and '
                                '@type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(
                s, encoding='unicode', method='text', with_tail=False) + '\n\n'
        if css:
            import css_parser
            import logging
            parser = css_parser.CSSParser(fetcher=None,
                                          log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = const.XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            # Rewrite selectors to match the generated XHTML
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)

        # Writes embedded binaries to disk and fills self.binary_map
        self.extract_embedded_content(doc)

        # --- Transform the FB2 XML to XHTML with the bundled XSLT ---
        log.debug('Converting XML to HTML...')
        with open(
                pkg_resources.resource_filename('ebook_converter',
                                                'data/fb2.xsl')) as f:
            ss = f.read()
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log.info('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub('', ss)

        styledoc = etree.fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        # notes: target id (href without '#') -> anchor that links to it
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        # cites: cite key -> anchor that has no href yet
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
        }
        all_ids = set(result.xpath('//*/@id'))
        for cite, a in cites.items():
            note = notes.get(cite, None)
            if note:
                # Find the first unused 'citeN' id
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        # The helper attributes are not wanted in the output
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        # Point images at the file names the binaries were written under
        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        with open('index.xhtml', 'wb') as f:
            f.write(index.encode('utf-8'))
        with open('inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))

        # --- Metadata and cover ---
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            # No cover in the metadata: use the first coverpage image that
            # has an href
            for img in doc.xpath('//f:coverpage/f:image',
                                 namespaces=NAMESPACES):
                href = img.get('{%s}href' % const.XLINK_NS,
                               img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        # --- Write the OPF listing everything in the working directory ---
        opf = OPFCreator(os.getcwd(), mi)
        entries = [(f2, mimetypes.guess_type(f2)[0])
                   for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        """Write the document's embedded ``binary`` elements to disk.

        Each direct child of ``doc`` that has text, an ``id`` attribute and
        ``binary`` in its tag is base64-decoded and written to the current
        working directory. ``self.binary_map`` is reset and then maps the
        id of every PNG/JPEG binary to the file name used for it (an
        extension is appended when the id has none of jpg/jpeg/png).

        The id comes from the document and is therefore untrusted: only its
        final path component is used as the file name, and ids that leave no
        usable name are skipped. Corrupt data is logged and skipped.
        """
        from ebook_converter.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if not (elem.text and 'binary' in elem.tag
                    and 'id' in elem.attrib):
                continue
            binary_id = elem.attrib['id']
            # Strip any directory part so that a crafted id such as
            # '../../x' cannot write outside the working directory
            fname = os.path.basename(binary_id.replace('\\', '/'))
            if fname in ('', '.', '..'):
                self.log.info('Ignoring binary data with unusable id=%r'
                              % binary_id)
                continue
            ct = elem.get('content-type', '')
            ext = ct.rpartition('/')[-1].lower()
            if ext in ('png', 'jpeg', 'jpg'):
                if fname.lower().rpartition('.')[-1] not in {
                        'jpg', 'jpeg', 'png'
                }:
                    fname += '.' + ext
                self.binary_map[elem.get('id')] = fname
            raw = elem.text.strip()
            try:
                data = base64_decode(raw)
            except (TypeError, ValueError):
                # binascii.Error, raised for malformed base64 on Python 3,
                # is a ValueError subclass
                self.log.exception(
                    'Binary data with id=%s is corrupted, '
                    'ignoring', elem.get('id'))
            else:
                with open(fname, 'wb') as f:
                    f.write(data)
Ejemplo n.º 21
0
class FB2Output(OutputFormatPlugin):
    """Output plugin that writes an OEB book as an FB2 document."""

    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'
    commit_name = 'fb2_output'

    # Genre identifiers offered as choices for the ``fb2_genre`` option.
    FB2_GENRES = [
        # Science Fiction & Fantasy
        'sf_history',  # Alternative history
        'sf_action',  # Action
        'sf_epic',  # Epic
        'sf_heroic',  # Heroic
        'sf_detective',  # Detective
        'sf_cyberpunk',  # Cyberpunk
        'sf_space',  # Space
        'sf_social',  # Social-philosophical
        'sf_horror',  # Horror & mystic
        'sf_humor',  # Humor
        'sf_fantasy',  # Fantasy
        'sf',  # Science Fiction
        # Detectives & Thrillers
        'det_classic',  # Classical detectives
        'det_police',  # Police Stories
        'det_action',  # Action
        'det_irony',  # Ironical detectives
        'det_history',  # Historical detectives
        'det_espionage',  # Espionage detectives
        'det_crime',  # Crime detectives
        'det_political',  # Political detectives
        'det_maniac',  # Maniacs
        'det_hard',  # Hard-boiled
        'thriller',  # Thrillers
        'detective',  # Detectives
        # Prose
        'prose_classic',  # Classics prose
        'prose_history',  # Historical prose
        'prose_contemporary',  # Contemporary prose
        'prose_counter',  # Counterculture
        'prose_rus_classic',  # Russian classics prose
        'prose_su_classics',  # Soviet classics prose
        # Romance
        'love_contemporary',  # Contemporary Romance
        'love_history',  # Historical Romance
        'love_detective',  # Detective Romance
        'love_short',  # Short Romance
        'love_erotica',  # Erotica
        # Adventure
        'adv_western',  # Western
        'adv_history',  # History
        'adv_indian',  # Indians
        'adv_maritime',  # Maritime Fiction
        'adv_geo',  # Travel & geography
        'adv_animal',  # Nature & animals
        'adventure',  # Other
        # Children's
        'child_tale',  # Fairy Tales
        'child_verse',  # Verses
        'child_prose',  # Prose
        'child_sf',  # Science Fiction
        'child_det',  # Detectives & Thrillers
        'child_adv',  # Adventures
        'child_education',  # Educational
        'children',  # Other
        # Poetry & Dramaturgy
        'poetry',  # Poetry
        'dramaturgy',  # Dramaturgy
        # Antique literature
        'antique_ant',  # Antique
        'antique_european',  # European
        'antique_russian',  # Old russian
        'antique_east',  # Old east
        'antique_myths',  # Myths. Legends. Epos
        'antique',  # Other
        # Scientific-educational
        'sci_history',  # History
        'sci_psychology',  # Psychology
        'sci_culture',  # Cultural science
        'sci_religion',  # Religious studies
        'sci_philosophy',  # Philosophy
        'sci_politics',  # Politics
        'sci_business',  # Business literature
        'sci_juris',  # Jurisprudence
        'sci_linguistic',  # Linguistics
        'sci_medicine',  # Medicine
        'sci_phys',  # Physics
        'sci_math',  # Mathematics
        'sci_chem',  # Chemistry
        'sci_biology',  # Biology
        'sci_tech',  # Technical
        'science',  # Other
        # Computers & Internet
        'comp_www',  # Internet
        'comp_programming',  # Programming
        'comp_hard',  # Hardware
        'comp_soft',  # Software
        'comp_db',  # Databases
        'comp_osnet',  # OS & Networking
        'computers',  # Other
        # Reference
        'ref_encyc',  # Encyclopedias
        'ref_dict',  # Dictionaries
        'ref_ref',  # Reference
        'ref_guide',  # Guidebooks
        'reference',  # Other
        # Nonfiction
        'nonf_biography',  # Biography & Memoirs
        'nonf_publicism',  # Publicism
        'nonf_criticism',  # Criticism
        'design',  # Art & design
        'nonfiction',  # Other
        # Religion & Inspiration
        'religion_rel',  # Religion
        'religion_esoterics',  # Esoterics
        'religion_self',  # Self-improvement
        'religion',  # Other
        # Humor
        'humor_anecdote',  # Anecdote (funny stories)
        'humor_prose',  # Prose
        'humor_verse',  # Verses
        'humor',  # Other
        # Home & Family
        'home_cooking',  # Cooking
        'home_pets',  # Pets
        'home_crafts',  # Hobbies & Crafts
        'home_entertain',  # Entertaining
        'home_health',  # Health
        'home_garden',  # Garden
        'home_diy',  # Do it yourself
        'home_sport',  # Sports
        'home_sex',  # Erotica & sex
        'home',  # Other
    ]
    # Maps each ``sectionize`` choice to its description, plus the genre
    # list.
    ui_data = {'sectionize': {'toc': 'Section per entry in the ToC',
                              'files': 'Section per file',
                              'nothing': 'A single section'},
               'genres': FB2_GENRES}

    options = {
        OptionRecommendation(
            name='sectionize',
            recommended_value='files',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sectionize']),
            help='Specify how sections are created:\n'
                 ' * nothing: {nothing}\n'
                 ' * files: {files}\n'
                 ' * toc: {toc}\n'
                 'If ToC based generation fails, adjust the "Structure '
                 'detection" and/or "Table of Contents" settings (turn on '
                 '"Force use of auto-generated Table of Contents")'
                 '.'.format(**ui_data['sectionize'])),
        OptionRecommendation(
            name='fb2_genre',
            recommended_value='antique',
            level=OptionRecommendation.LOW,
            choices=FB2_GENRES,
            help='Genre for the book. Choices: %s\n\n See: http://www.'
                 'fictionbook.org/index.php/Eng:FictionBook_2.1_genres for a '
                 'complete list with descriptions.' % ', '.join(FB2_GENRES)),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Serialize *oeb_book* to FB2 and write it to *output_path*.

        :param oeb_book: the book to convert; SVG content is rasterized
            (when the rasterizer is available) and the jacket linearized
            before the FB2 markup is generated.
        :param output_path: either a filesystem path or an object with a
            ``write`` method. A stream is rewound, truncated and
            overwritten but left open; a path is created (together with
            any missing parent directory) and closed when done.
        :param input_plugin: unused here.
        :param opts: conversion options, passed to the rasterizer and to
            the FB2 generator.
        :param log: logger used for warnings and by the FB2 generator.
        """
        from ebook_converter.ebooks.oeb.transforms.jacket import \
            linearize_jacket
        from ebook_converter.ebooks.oeb.transforms.rasterize import \
            SVGRasterizer, Unavailable
        from ebook_converter.ebooks.fb2.fb2ml import FB2MLizer

        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warning('SVG rasterizer unavailable, SVG will not be '
                        'converted')

        linearize_jacket(oeb_book)

        fb2mlizer = FB2MLizer(log)
        fb2_content = fb2mlizer.extract_content(oeb_book, opts)
        payload = fb2_content.encode('utf-8', 'replace')

        if hasattr(output_path, 'write'):
            # Caller-owned stream: replace its content, do not close it.
            output_path.seek(0)
            output_path.truncate()
            output_path.write(payload)
            return

        out_dir = os.path.dirname(output_path)
        if out_dir:
            # exist_ok avoids the check-then-create race of the
            # exists()/makedirs() pair.
            os.makedirs(out_dir, exist_ok=True)
        # The context manager closes the file even when the write fails;
        # mode 'wb' already truncates, so no explicit seek/truncate.
        with open(output_path, 'wb') as out_stream:
            out_stream.write(payload)
Ejemplo n.º 22
0
class PMLOutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as a zipped PML archive."""

    name = 'PML Output'
    author = 'John Schember'
    file_type = 'pmlz'
    commit_name = 'pml_output'

    options = {
        OptionRecommendation(
            name='pml_output_encoding',
            recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
            'The default is cp1252.'),
        OptionRecommendation(
            name='inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.'),
        OptionRecommendation(
            name='full_image_depth',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not reduce the size or bit depth of images. Images '
            'have their size and depth reduced by default to accommodate '
            'applications that can not convert images on their '
            'own such as Dropbook.'),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Convert *oeb_book* to PML and zip the result to *output_path*.

        The PML text is written as ``index.pml`` and the referenced images
        under ``index_img`` inside a temporary directory, which is then
        added to a zip archive created at *output_path*.

        :param oeb_book: the book to convert.
        :param output_path: destination handed to ``ZipFile``.
        :param input_plugin: unused here.
        :param opts: conversion options; ``pml_output_encoding`` selects
            the text encoding (unencodable characters are replaced).
        :param log: logger passed to the PML generator.
        """
        from ebook_converter.ebooks.pml.pmlml import PMLMLizer
        from ebook_converter.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = str(pmlmlizer.extract_content(oeb_book, opts))
            with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            os.makedirs(img_path, exist_ok=True)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs,
                              img_path, opts)

            log.debug('Compressing output...')
            pmlz = ZipFile(output_path, 'w')
            try:
                pmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalized before the
                # temporary directory is removed, not at garbage
                # collection.
                pmlz.close()

    def write_images(self, manifest, image_hrefs, out_dir, opts):
        """Save the referenced raster images of *manifest* as PNG files.

        :param manifest: iterable of items exposing ``media_type``,
            ``href`` and ``data``.
        :param image_hrefs: mapping of item href to output file name; only
            items whose href is a key are written.
        :param out_dir: directory the PNG files are written into.
        :param opts: when ``full_image_depth`` is false, images are
            converted to palette mode and shrunk to fit 300x300 pixels.
        """
        from PIL import Image

        from ebook_converter.ebooks.oeb.base import OEB_RASTER_IMAGES

        for item in manifest:
            if item.media_type not in OEB_RASTER_IMAGES:
                continue
            if item.href not in image_hrefs:
                continue

            im = Image.open(io.BytesIO(item.data))
            if not opts.full_image_depth:
                im = im.convert('P')
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                im.thumbnail((300, 300), Image.LANCZOS)

            buf = io.BytesIO()
            im.save(buf, 'PNG')

            path = os.path.join(out_dir, image_hrefs[item.href])
            with open(path, 'wb') as out:
                out.write(buf.getvalue())