Esempio n. 1
0
class DOCXInput(InputFormatPlugin):
    """Input plugin that hands DOCX/DOCM streams to the DOCX-to-HTML converter."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = _('Convert DOCX files (.docx and .docm) to HTML')
    file_types = {'docx', 'docm'}

    # Options specific to this plugin; both are off by default.
    options = {
        OptionRecommendation(
            name='docx_no_cover',
            recommended_value=False,
            help=_(
                'Normally, if a large image is present at the start of the document that looks like a cover, '
                'it will be removed from the document and used as the cover for created ebook. This option '
                'turns off that behavior.'),
        ),
        OptionRecommendation(
            name='docx_no_pagebreaks_between_notes',
            recommended_value=False,
            help=_('Do not insert a page break after every endnote.'),
        ),
    }

    recommendations = {
        ('page_breaks_before', '/', OptionRecommendation.MED),
    }

    def convert(self, stream, options, file_ext, log, accelerators):
        """Build a ``Convert`` object for ``stream`` and return the result of calling it.

        Cover detection is enabled unless ``options.docx_no_cover`` is set, and
        ``options.docx_no_pagebreaks_between_notes`` is forwarded as
        ``notes_nopb``. ``file_ext`` and ``accelerators`` are not used here.
        """
        from calibre.ebooks.docx.to_html import Convert

        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
        )
        return converter()
Esempio n. 2
0
class PDBOutput(OutputFormatPlugin):
    """Output plugin that writes a book into a PDB container."""

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(
            name='format',
            recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f',
            choices=list(ALL_FORMAT_WRITERS),
            help=(_('Format to use inside the pdb container. Choices are:') +
                  ' %s' % sorted(ALL_FORMAT_WRITERS))),
        OptionRecommendation(
            name='pdb_output_encoding',
            recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_(
                'Specify the character encoding of the output document. '
                'The default is cp1252. Note: This option is not honored by all '
                'formats.')),
        OptionRecommendation(
            name='inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` using the writer for ``opts.format``.

        ``output_path`` may be a filesystem path or an object with a ``write``
        attribute. When it is a path, missing parent directories are created,
        the file is opened in binary mode, and it is closed again before this
        method returns, even if writing fails. A stream passed in by the caller
        is never closed here.

        ``opts.max_line_length`` and ``opts.force_max_line_length`` are
        overwritten with ``0`` and ``False`` before the writer is created.

        Raises ``PDBError`` when ``get_writer`` returns ``None`` for
        ``opts.format``.
        """
        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            Writer = get_writer(opts.format)
            if Writer is None:
                # Report the requested format, not the ``format`` builtin.
                raise PDBError(
                    'No writer available for format %s.' % opts.format)

            opts.max_line_length = 0
            opts.force_max_line_length = False

            writer = Writer(opts, log)

            # Start from an empty stream regardless of any previous content.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams opened by this method.
            if close:
                out_stream.close()
Esempio n. 3
0
class DOCXOutput(OutputFormatPlugin):
    """Output plugin that writes a book as a DOCX file."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'

    options = {
        OptionRecommendation(
            name='docx_page_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
                   'are %s') % PAGE_SIZES,
        ),
        OptionRecommendation(
            name='docx_custom_page_size',
            recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height (in pts). '
                   'This overrides any specified page-size.'),
        ),
        OptionRecommendation(
            name='docx_no_cover',
            recommended_value=False,
            help=_('Do not insert the book cover as an image at the start of the document.'
                   ' If you use this option, the book cover will be discarded.'),
        ),
        OptionRecommendation(
            name='docx_no_toc',
            recommended_value=False,
            help=_('Do not insert the table of contents as a page at the start of the document.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'DOCX',
        ),
    }

    # All four page margins are recommended at 72.0 with MED priority.
    recommendations = {
        ('margin_left', 72.0, OptionRecommendation.MED),
        ('margin_right', 72.0, OptionRecommendation.MED),
        ('margin_top', 72.0, OptionRecommendation.MED),
        ('margin_bottom', 72.0, OptionRecommendation.MED),
    }

    def convert_metadata(self, oeb):
        """Serialise ``oeb.metadata`` into an OPF 2.0 package and store the parsed result.

        The package element is rendered to UTF-8 bytes, read back through the
        OPF reader, and the resulting book metadata is kept on ``self.mi``.
        """
        from io import BytesIO
        from lxml import etree
        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
        from calibre.ebooks.oeb.base import OPF, OPF2_NS

        package = etree.Element(
            OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
        oeb.metadata.to_opf2(package)
        raw_opf = etree.tostring(package, encoding='utf-8')
        reader = ReadOPF(
            BytesIO(raw_opf), populate_spine=False, try_to_guess_cover=False)
        self.mi = reader.to_book_metadata()

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Convert ``oeb`` into a DOCX container and write it to ``output_path``.

        The negated ``docx_no_cover`` and ``docx_no_toc`` options are passed
        positionally to ``Convert``. When ``opts.extract_to`` is truthy, the
        written file is additionally dumped to that location via ``do_dump``.
        """
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert

        docx = DOCX(opts, log)
        self.convert_metadata(oeb)

        converter = Convert(
            oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)
        converter()
        docx.write(output_path, self.mi)

        if not opts.extract_to:
            return
        from calibre.ebooks.docx.dump import do_dump
        do_dump(output_path, opts.extract_to)
Esempio n. 4
0
class RBOutput(OutputFormatPlugin):
    """Output plugin that writes a book in the RB format."""

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = {
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` with ``RBWriter``.

        ``output_path`` may be a filesystem path or an object with a ``write``
        attribute. When it is a path, missing parent directories are created,
        the file is opened in binary mode, and it is closed again before this
        method returns, even if writing fails. A stream passed in by the caller
        is never closed here.
        """
        from calibre.ebooks.rb.writer import RBWriter

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            # Start from an empty stream regardless of any previous content.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams opened by this method.
            if close:
                out_stream.close()
Esempio n. 5
0
class DOCXOutput(OutputFormatPlugin):
    """Output plugin that writes a book as a DOCX file."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'

    options = {
        OptionRecommendation(name='docx_page_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
                   'are %s') % PAGE_SIZES),

        OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height (in pts). '
                   'This overrides any specified page-size.')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'DOCX'),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Convert ``oeb`` into a DOCX container and write it to ``output_path``.

        When ``opts.extract_to`` is truthy, the written file is additionally
        dumped to that location via ``do_dump``.
        """
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert

        docx = DOCX(opts, log)
        converter = Convert(oeb, docx)
        converter()
        docx.write(output_path, oeb)

        if not opts.extract_to:
            return
        from calibre.ebooks.docx.dump import do_dump
        do_dump(output_path, opts.extract_to)
Esempio n. 6
0
class TCROutput(OutputFormatPlugin):
    """Output plugin that writes a book as TCR-compressed text."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(
            name='tcr_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is utf-8.'))
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Extract the book text, compress it and write it to ``output_path``.

        ``output_path`` may be a filesystem path or an object with a ``write``
        attribute. When it is a path, missing parent directories are created,
        the file is opened in binary mode, and it is closed again before this
        method returns, even if extraction, compression or writing fails. A
        stream passed in by the caller is never closed here.

        Four text-layout attributes on ``opts`` are overwritten before the
        text is extracted. The text is encoded with
        ``opts.tcr_output_encoding`` using the ``'replace'`` error handler.
        """
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.ebooks.compression.tcr import compress

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            opts.flush_paras = False
            opts.max_line_length = 0
            opts.force_max_line_length = False
            opts.indent_paras = False

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts)
            txt = txt.encode(opts.tcr_output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt)

            # Start from an empty stream regardless of any previous content.
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close streams opened by this method.
            if close:
                out_stream.close()
Esempio n. 7
0
class KindleComics(InputFormatPlugin):
    """Input plugin that passes CBZ/CBR comic archives to ``make_book``."""

    name = 'Kindle Comics Input'
    author = 'Pavel Zwerschke'
    supported_platforms = ['windows', 'osx', 'linux']
    file_types = {'cbz', 'cbr'}
    version = (0, 0, 2)
    description = 'Converts cbz and cbr files into a kindle format that is actually readable on Kindle devices.'

    minimum_calibre_version = (5, 0, 0)

    # Note: the numeric defaults (max_width, max_height, gamma) are strings.
    options = {
        OptionRecommendation(
            name='manga',
            recommended_value=False,
            help='Used for right-to-left publications like manga.',
        ),
        OptionRecommendation(
            name='webtoon',
            recommended_value=False,
            help='Used for korean webtoons.',
        ),
        OptionRecommendation(
            name='margins',
            choices=['auto', 'black', 'white'],
            recommended_value='auto',
            help='What color should the margins have.',
        ),
        OptionRecommendation(
            name='no_greyscale',
            recommended_value=False,
            help='Don\'t convert the image to grayscale (black and white).',
        ),
        OptionRecommendation(
            name='max_width',
            recommended_value="1264",
            help='Maximum width.',
        ),
        OptionRecommendation(
            name='max_height',
            recommended_value="1680",
            help='Maximum height.',
        ),
        OptionRecommendation(
            name='gamma',
            recommended_value="1.0",
            help='Gamma correction. 0 means automatic.',
        ),
    }

    def gui_configuration_widget(self, parent, get_option_by_name, get_option_help, db, book_id=None):
        """Return a ``PluginWidget`` built from the given arguments."""
        from calibre_plugins.kindle_comics.kindle_comics_input import PluginWidget

        widget = PluginWidget(parent, get_option_by_name, get_option_help, db, book_id)
        return widget

    def convert(self, stream, options, file_ext, log, accelerators):
        """Close ``stream`` and return what ``make_book`` produces for its file.

        The absolute path is taken from ``stream.name`` before the stream is
        closed; ``make_book`` then receives that path, not the stream.
        """
        from calibre_plugins.kindle_comics.make_book import make_book

        source_path = os.path.abspath(stream.name)
        stream.close()
        settings = _convert_options_to_dict(options)
        return make_book(settings, source_path, log)
Esempio n. 8
0
class TXTInput(InputFormatPlugin):
    """Input plugin that turns plain-text style files into HTML and then OEB."""

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}

    options = {
        OptionRecommendation(
            name='paragraph_type',
            recommended_value='auto',
            choices=['auto', 'block', 'single', 'print', 'unformatted', 'off'],
            help=
            _('Paragraph structure.\n'
              'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\', \'off\']\n'
              '* auto: Try to auto detect paragraph type.\n'
              '* block: Treat a blank line as a paragraph break.\n'
              '* single: Assume every line is a paragraph.\n'
              '* print:  Assume every line starting with 2+ spaces or a tab '
              'starts a paragraph.\n'
              '* unformatted: Most lines have hard line breaks, few/no blank lines or indents. '
              'Tries to determine structure and reformat the differentiate elements.\n'
              '* off: Don\'t modify the paragraph structure. This is useful when combined with '
              'Markdown or Textile formatting to ensure no formatting is lost.'
              )),
        OptionRecommendation(
            name='formatting_type',
            recommended_value='auto',
            choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'],
            help=
            _('Formatting used within the document.'
              '* auto: Automatically decide which formatting processor to use.\n'
              '* plain: Do not process the document formatting. Everything is a '
              'paragraph and no styling is applied.\n'
              '* heuristic: Process using heuristics to determine formatting such '
              'as chapter headings and italic text.\n'
              '* textile: Processing using textile formatting.\n'
              '* markdown: Processing using markdown formatting. '
              'To learn more about markdown see') +
            ' https://daringfireball.net/projects/markdown/'),
        OptionRecommendation(
            name='preserve_spaces',
            recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                   'With this option all spaces will be displayed.')),
        OptionRecommendation(
            name='txt_in_remove_indents',
            recommended_value=False,
            help=_(
                'Normally extra space at the beginning of lines is retained. '
                'With this option they will be removed.')),
        OptionRecommendation(
            name="markdown_extensions",
            recommended_value='footnotes, tables, toc',
            help=
            _('Enable extensions to markdown syntax. Extensions are formatting that is not part '
              'of the standard markdown format. The extensions enabled by default: %default.\n'
              'To learn more about markdown extensions, see https://pythonhosted.org/Markdown/extensions/index.html\n'
              'This should be a comma separated list of extensions to enable:\n'
              ) + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k])
                            for k in sorted(MD_EXTENSIONS))),
    }

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text in ``stream`` to HTML and return the resulting OEB book.

        Steps, in order:

        1. Read the raw bytes (for ``txtz``, the archive is extracted into the
           current directory and every ``.txt``/``.text`` file is concatenated).
        2. Pick an input encoding (user supplied, detected, or UTF-8), strip a
           leading BOM and decode with the ``'replace'`` error handler.
        3. Replace entities, normalise line endings, and resolve the ``auto``
           paragraph and formatting types.
        4. Reshape paragraphs, optionally dehyphenate, remove indents and
           preserve spaces.
        5. Render to HTML with the markdown, textile or basic processor.
        6. Write the HTML to a temporary ``index*.html`` file, run it through
           the HTML input plugin, and delete the temporary file.
        7. Copy metadata into ``oeb.metadata``.

        Side effects: ``options`` is mutated (types resolved, HTML input
        defaults applied, ``input_encoding`` forced to ``'utf-8'``), and
        ``self.log`` and ``self.html_postprocess_title`` are set.

        Raises ``ValueError`` when the markdown processor raises
        ``RuntimeError``.
        """
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        # Raw input is accumulated as bytes and only decoded once the
        # encoding is known.
        txt = b''
        log.debug('Reading text from file...')
        length = 0

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s' %
                         options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt)
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_',
                    '-').strip() in ('gb2312', 'chinese', 'csiso58gb231280',
                                     'euc-cn', 'euccn', 'eucgb2312-cn',
                                     'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of %s%%' %
                (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect using %s'
                % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug(
                    'Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' %
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' %
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            # Reuse the line length computed for 'unformatted' input, if any.
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        x.strip()
                        for x in options.markdown_extensions.split(',')
                        if x.strip()
                    ])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                )
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        # os.getcwdu() only exists on Python 2; fall back to os.getcwd().
        base = getattr(os, 'getcwdu', os.getcwd)()
        if file_ext != 'txtz' and hasattr(stream, 'name'):
            base = os.path.dirname(stream.name)
        # Pick an unused index*.html name inside ``base``; the fallback names
        # must stay in the same directory as the first candidate.
        fname = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(fname):
            c += 1
            fname = os.path.join(base, 'index%d.html' % c)
        with open(fname, 'wb') as htmlfile:
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion.
            with open(fname, 'rb') as html_stream:
                oeb = html_input.convert(html_stream, options, 'html', log,
                                         {})
        finally:
            # Restore the caller's setting and drop the temporary file even
            # when the HTML conversion fails.
            options.debug_pipeline = odi
            os.remove(fname)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Replace placeholder ``title`` element text with the stored title.

        Every spine item whose ``data`` has an ``xpath`` method is searched for
        elements with local name ``title``; those whose text equals the
        translated ``'Unknown'`` are set to ``self.html_postprocess_title``.
        """
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title
Esempio n. 9
0
class PDFInput(InputFormatPlugin):
    """Input plugin that converts PDF files via ``pdftohtml`` and writes an OPF."""

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = _('Convert PDF files to HTML')
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(name='no_images', recommended_value=False,
            help=_('Do not extract images from the document')),
        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
                   'default is 0.45, just below the median line length.')),
        OptionRecommendation(name='new_pdf_engine', recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'))
    }

    def convert_new(self, stream, accelerators):
        """Run the XML-based conversion path and return the ``metadata.opf`` path.

        ``pdftohtml`` is invoked with ``as_xml=True``; the resulting
        ``index.xml`` is cleaned and handed to ``PDFDocument``. Reads
        ``self.opts`` and ``self.log``, which ``convert`` sets beforehand.
        """
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(os.getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as xml_file:
            raw_xml = xml_file.read()
        cleaned_xml = clean_ascii_chars(raw_xml)
        PDFDocument(cleaned_xml, self.opts, self.log)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the PDF in ``stream`` and return the path of the generated OPF.

        Stores ``options`` and ``log`` on the instance, then either delegates
        to ``convert_new`` (when ``options.new_pdf_engine`` is set) or runs
        ``pdftohtml`` in the current directory. In the latter case every entry
        of the current directory is added to the manifest, ``index.html`` is
        the only spine item, and a ``toc`` attribute is patched into the
        ``<spine`` tag of the written OPF when ``toc.ncx`` exists and has a
        manifest id.
        """
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts = options
        self.log = log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(os.getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        metadata = get_metadata(stream, 'pdf')
        opf = OPFCreator(os.getcwd(), metadata)

        # index.html goes first; everything else in the directory follows.
        other_entries = os.listdir(os.getcwd())
        other_entries.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend((entry, None) for entry in other_entries)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opf_out:
            opf.render(opf_out)

        ncx_id = None
        if os.path.exists('toc.ncx'):
            ncx_id = opf.manifest.id_for_path('toc.ncx')
        if ncx_id:
            # Rewrite the OPF in place with the toc reference on the spine.
            with lopen('metadata.opf', 'r+b') as opf_file:
                original = opf_file.read()
                patched = original.replace(
                    b'<spine', b'<spine toc="%s"' % as_bytes(ncx_id))
                opf_file.seek(0)
                opf_file.write(patched)

        return os.path.join(os.getcwd(), 'metadata.opf')
Esempio n. 10
0
class EPUBOutput(OutputFormatPlugin):

    # Plugin identification.
    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'

    # Options specific to this plugin. The methods below read them from the
    # conversion options object under these names (e.g. opts.flow_size).
    options = set([
        OptionRecommendation(
            name='extract_to',
            help=_(
                'Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'EPUB'),
        OptionRecommendation(
            name='dont_split_on_page_breaks',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                   'files are automatically split at every page break into '
                   'two files. This gives an output e-book that can be '
                   'parsed faster and with less resources. However, '
                   'splitting is slow and if your source file contains a '
                   'very large number of page breaks, you should turn off '
                   'splitting on page breaks.')),
        OptionRecommendation(
            name='flow_size',
            recommended_value=260,
            help=_(
                'Split all HTML files larger than this size (in KB). '
                'This is necessary as most EPUB readers cannot handle large '
                'file sizes. The default of %defaultKB is the size required '
                'for Adobe Digital Editions. Set to 0 to disable size based splitting.'
            )),
        OptionRecommendation(
            name='no_default_epub_cover',
            recommended_value=False,
            help=_(
                'Normally, if the input file has no cover and you don\'t'
                ' specify one, a default cover is generated with the title, '
                'authors, etc. This option disables the generation of this cover.'
            )),
        OptionRecommendation(
            name='no_svg_cover',
            recommended_value=False,
            help=_('Do not use SVG for the book cover. Use this option if '
                   'your EPUB is going to be used on a device that does not '
                   'support SVG, like the iPhone or the JetBook Lite. '
                   'Without this option, such devices will display the cover '
                   'as a blank page.')),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=
            _('When using an SVG cover, this option will cause the cover to scale '
              'to cover the available screen area, but still preserve its aspect ratio '
              '(ratio of width to height). That means there may be white borders '
              'at the sides or top and bottom of the image, but the image will '
              'never be distorted. Without this option the image may be slightly '
              'distorted, but there will be no borders.')),
        OptionRecommendation(
            name='epub_flatten',
            recommended_value=False,
            help=_(
                'This option is needed only if you intend to use the EPUB'
                ' with FBReaderJ. It will flatten the file system inside the'
                ' EPUB, putting all files into the top level.')),
        OptionRecommendation(
            name='epub_inline_toc',
            recommended_value=False,
            help=
            _('Insert an inline Table of Contents that will appear as part of the main book content.'
              )),
        OptionRecommendation(
            name='epub_toc_at_end',
            recommended_value=False,
            help=
            _('Put the inserted inline Table of Contents at the end of the book instead of the start.'
              )),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),
    ])

    # (option name, value, level) recommendation for the generic
    # pretty_print option; condense_ncx() below reads opts.pretty_print.
    recommendations = set([('pretty_print', True, OptionRecommendation.HIGH)])

    def workaround_webkit_quirks(self):  # {{{
        '''
        Retag every <pre> that has neither text nor children as a <div>, in
        each spine item that has a usable <body>.
        '''
        from calibre.ebooks.oeb.base import XPath
        for item in self.oeb.spine:
            found = XPath('//h:body')(item.data)
            # Take the first <body> if there is one; otherwise keep the empty
            # result, which fails the hasattr() check below.
            body = found[0] if found else found
            if hasattr(body, 'xpath'):
                for pre in XPath('//h:pre')(body):
                    if len(pre) == 0 and not pre.text:
                        pre.tag = 'div'

    # }}}

    def upshift_markup(self):  # {{{
        '''
        Upgrade markup to comply with XHTML 1.1 where possible.

        For every spine item: copy a plain ``lang`` attribute on the root to
        ``xml:lang`` when the latter is missing, turn <u> into <span>, and
        drop ``id``/``name`` attributes whose value was already used by an
        earlier element in the same document.
        '''
        from calibre.ebooks.oeb.base import XPath, XML
        for item in self.oeb.spine:
            root = item.data
            if not root.get(XML('lang')):
                plain_lang = root.get('lang')
                if plain_lang:
                    root.set(XML('lang'), plain_lang)

            found = XPath('//h:body')(root)
            body = found[0] if found else found
            if not hasattr(body, 'xpath'):
                continue

            for underline in XPath('//h:u')(root):
                underline.tag = 'span'

            # First occurrence of a value wins; later duplicates lose the
            # attribute. ids and names are tracked independently.
            used = {'id': set(), 'name': set()}
            for elem in XPath('//*[@id or @name]')(root):
                for attr in ('id', 'name'):
                    value = elem.get(attr, None)
                    if not value:
                        continue
                    if value in used[attr]:
                        del elem.attrib[attr]
                    else:
                        used[attr].add(value)

    # }}}
    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Write the book ``oeb`` to ``output_path`` as an EPUB file.

        Steps, in order: optionally add an inline TOC, flatten or uniquify
        file names, apply the ADE/WebKit/markup workarounds, rescale images,
        split the flows, run the cover manager, apply the Sony workaround,
        make sure a TOC entry and a UUID identifier exist, let the ``oeb``
        output plugin render into a temporary directory, obfuscate any fonts
        listed in ``input_plugin.encrypted_fonts`` and finally zip everything
        into the container. If ``opts.extract_to`` is set, the finished EPUB
        is also extracted there (an existing file or directory at that path is
        deleted first).

        :param oeb: the book to write
        :param output_path: path of the EPUB file to create
        :param input_plugin: input plugin; only its optional
            ``encrypted_fonts`` attribute is read here
        :param opts: conversion options
        :param log: logger
        '''
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from calibre.ebooks.mobi.writer8.toc import TOCAdder
            # TOCAdder is configured through mobi_* option names, so map the
            # EPUB options onto them.
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb,
                     opts,
                     replace_previous_inline_toc=True,
                     ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from calibre.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from calibre.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                      max_flow_size=self.opts.flow_size * 1024)
        split(self.oeb, self.opts)

        from calibre.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                          'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add(_('Start'), first.href)

        from calibre.ebooks.oeb.base import OPF
        identifiers = oeb.metadata['identifier']
        uuid = None
        for x in identifiers:
            # An identifier may carry no scheme attribute at all; treat that
            # as an empty scheme instead of calling .lower() on None.
            scheme = x.get(OPF('scheme'), None) or ''
            if scheme.lower() == 'uuid' or str(x).startswith('urn:uuid:'):
                uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if uuid is None:
            self.log.warn('No UUID identifier found')
            from uuid import uuid4
            uuid = str(uuid4())
            oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

        if encrypted_fonts and not uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if str(x) == uuid:
                    x.content = 'urn:uuid:' + uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from calibre.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml',
                                      atom_xml)]
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            # The first .opf and first .ncx found in tdir are used; an
            # IndexError here means the oeb plugin did not write one.
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([
                os.path.join(tdir, x) for x in os.listdir(tdir)
                if x.endswith('.ncx')
            ][0])
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

            from calibre.ebooks.epub import initialize_container
            with initialize_container(output_path,
                                      os.path.basename(opf),
                                      extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', encryption)
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                                  metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from calibre.utils.zipfile import ZipFile
                # Start from a clean slate: whatever is at extract_to goes.
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)

    def encrypt_fonts(self, uris, tdir, uuid):  # {{{
        '''
        Obfuscate the fonts named by ``uris`` in place and build the matching
        ``encryption.xml`` document.

        The key is made of the hexadecimal digits found in ``uuid`` (repeated
        if needed and cut to 16 bytes). The first 1024 bytes of every font are
        XOR-ed with that key, byte ``i`` using key byte ``i % 16``.

        :param uris: '/'-separated font paths relative to ``tdir``; entries
            whose file does not exist are skipped
        :param tdir: directory holding the rendered book
        :param uuid: identifier the key is derived from
        :return: the XML as UTF-8 encoded bytes, or ``None`` when no font in
            ``uris`` exists
        :raises ValueError: if ``uuid`` has fewer than 16 hexadecimal digits
        '''
        from binascii import unhexlify
        from xml.sax.saxutils import escape

        key = re.sub(r'[^a-fA-F0-9]', '', uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid' % uuid)
        # bytearray yields ints on every Python version, so the XOR below
        # needs no ord()/chr() round trips.
        key = bytearray(unhexlify((key + key)[:32]))

        fonts = []
        for uri in uris:
            if isinstance(uri, bytes):
                uri = uri.decode('utf-8')
            # Address the file through tdir rather than changing the
            # process-wide working directory.
            path = os.path.join(tdir, *uri.split('/'))
            if not os.path.exists(path):
                continue
            self.log.debug('Encrypting font:', uri)
            with open(path, 'r+b') as f:
                data = bytearray(f.read(1024))
                if len(data) >= 1024:
                    f.seek(0)
                    f.write(bytes(bytearray(
                        b ^ key[i % 16] for i, b in enumerate(data))))
                else:
                    self.log.warn('Font', path, 'is invalid, ignoring')
            # NOTE(review): a font that was too short to obfuscate is still
            # listed below, as in the previous implementation -- confirm
            # whether that is intended.
            fonts.append('''
                <enc:EncryptedData>
                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                    <enc:CipherData>
                    <enc:CipherReference URI="%s"/>
                    </enc:CipherData>
                </enc:EncryptedData>
                ''' % escape(uri, {'"': '&quot;'}))
        if fonts:
            ans = '''<encryption
                    xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                    '''
            ans += '\n'.join(fonts)
            ans += '\n</encryption>'
            return ans.encode('utf-8')

    # }}}

    def condense_ncx(self, ncx_path):
        '''
        Strip leading/trailing whitespace from the text and tail of every
        element in the NCX file at ``ncx_path`` and rewrite the file in place.

        Does nothing when ``self.opts.pretty_print`` is set.

        :param ncx_path: path of the NCX file to rewrite
        '''
        if self.opts.pretty_print:
            return
        # Imported only when there is actual work to do.
        from lxml import etree
        tree = etree.parse(ncx_path)
        for tag in tree.getroot().iter(tag=etree.Element):
            if tag.text:
                tag.text = tag.text.strip()
            if tag.tail:
                tag.tail = tag.tail.strip()
        compressed = etree.tostring(tree.getroot(), encoding='utf-8')
        # Context manager so the handle is closed (and flushed) right away.
        with open(ncx_path, 'wb') as f:
            f.write(compressed)

    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        Works in place on ``self.oeb``: the TOC hrefs, the markup of every
        spine item and, when there is one, the main stylesheet.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                # The separator is bound to a throwaway name other than ``_``
                # so the translation function is not shadowed in this method.
                base, _sep, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'
                        % frag)
                    node.href = base

        # Loop invariants, built once instead of once per spine item.
        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                             './h:label|./h:fieldset|./h:legend')
        in_table = XPath('ancestor::h:table')
        special_chars = re.compile('[\u200b\u00ad]')

        # NOTE(review): the removals below go through parent.remove(child);
        # with lxml that also drops the text following the removed element
        # (its tail) -- confirm that this is acceptable.
        for item in self.oeb.spine:
            root = item.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for img in XPath('//h:img')(body):
                    src = img.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for anchor in XPath('//h:a[@name]')(body):
                    if not anchor.get('id', False):
                        anchor.set('id', anchor.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No usable preceding sibling: fall back to the body
                        # itself. Kept broad on purpose (best effort), but no
                        # longer swallows KeyboardInterrupt/SystemExit.
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = [s for s in (part.strip() for part in style) if s]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                # SVG objects are the only ones that are kept
                if tag.get('type', '').lower().strip() in {
                        'image/svg+xml', 'application/svg+xml'
                }:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False)
                        and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.' + lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(
                            CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')

    # }}}

    def workaround_sony_quirks(self):  # {{{
        '''
        Perform toc link transforms to alleviate slow loading.

        Every TOC entry whose fragment points at an element that
        ``item_at_top`` reports as being at the top of its spine document has
        the fragment dropped from its href.
        '''
        from calibre.ebooks.oeb.base import urldefrag, XPath
        from calibre.ebooks.oeb.polish.toc import item_at_top

        def frag_is_at_top(root, frag):
            matches = XPath('//*[@id="%s" or @name="%s"]' % (frag, frag))(root)
            return item_at_top(matches[0]) if matches else False

        def simplify_toc_entry(toc):
            if toc.href:
                href, frag = urldefrag(toc.href)
                if frag:
                    # Only the first spine item with a matching href counts.
                    target = next(
                        (s for s in self.oeb.spine if s.href == href), None)
                    if target is not None and frag_is_at_top(
                            target.data, frag):
                        self.log.debug('Removing anchor from TOC href:',
                                       href + '#' + frag)
                        toc.href = href
            for child in toc:
                simplify_toc_entry(child)

        if self.oeb.toc:
            simplify_toc_entry(self.oeb.toc)
Esempio n. 11
0
class FB2Input(InputFormatPlugin):

    # Plugin identification and the file extension it handles.
    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 files to HTML'
    file_types = set(['fb2'])

    # (option name, value, level) recommendations: XPath expressions for
    # the three TOC levels, matching h1/h2/h3 headings.
    recommendations = set([
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
    ])

    # Option specific to this plugin; convert() reads it as
    # options.no_inline_fb2_toc.
    options = set([
        OptionRecommendation(
            name='no_inline_fb2_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=
            _('Do not insert a Table of Contents at the beginning of the book.'
              )),
    ])

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the FB2 document in ``stream`` to XHTML in the current
        working directory.

        Writes ``index.xhtml``, ``inline-styles.css``, the embedded binaries
        (through :meth:`extract_embedded_content`), optionally
        ``fb2_cover_calibre_mi.jpg``, and ``metadata.opf``.

        :param stream: seekable file-like object holding the FB2 data
        :param options: conversion options; ``no_inline_fb2_toc`` is read
        :param log: logger
        :return: path of the generated ``metadata.opf``
        :raises ValueError: if the document cannot be parsed as XML
        '''
        from lxml import etree
        from calibre.ebooks.metadata.fb2 import ensure_namespace
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = stream.read()
        # The stream may hand back bytes or text; strip NULs with a matching
        # literal type (mixing the two raises TypeError on Python 3).
        if isinstance(raw, bytes):
            raw = raw.replace(b'\0', b'')
        else:
            raw = raw.replace('\0', '')
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             assume_utf8=True,
                             resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            try:
                doc = etree.fromstring(raw, parser=RECOVER_PARSER)
                if doc is None:
                    raise Exception('parse failed')
            except Exception:
                # Last resort: escape bare ampersands, keeping the space
                # that followed them.
                doc = etree.fromstring(raw.replace('& ', '&amp; '),
                                       parser=RECOVER_PARSER)
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f': fb_ns, 'l': XLINK_NS}
        stylesheets = doc.xpath(
            '//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''.join(
            etree.tostring(s, encoding=str, method='text', with_tail=False) +
            '\n\n' for s in stylesheets)
        if css:
            import cssutils
            import logging
            parser = cssutils.CSSParser(fetcher=None,
                                        log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = stylesheet.cssText
            # The serialized sheet can be a byte string; str() on that would
            # yield its repr ("b'...'") rather than the CSS itself.
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            else:
                css = str(css)
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        # The stylesheet is handled as bytes from start to finish, so it can
        # be parsed whether or not it carries an XML encoding declaration.
        with open(P('templates/fb2.xsl'), 'rb') as f:
            ss = f.read()
        fb_ns_bytes = fb_ns if isinstance(fb_ns, bytes) else fb_ns.encode(
            'utf-8')
        ss = ss.replace(b'__FB_NS__', fb_ns_bytes)
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(rb'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub(b'', ss)

        styledoc = etree.fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
        }
        all_ids = set(result.xpath('//*/@id'))
        for cite, a in cites.items():
            note = notes.get(cite, None)
            if note:
                # Smallest citeN id that is not in use yet
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        # Point images at the file names chosen by extract_embedded_content
        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        if not isinstance(index, bytes):
            index = index.encode('utf-8')
        with open('index.xhtml', 'wb') as f:
            f.write(index)
        with open('inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            # Fall back to the first <coverpage> image that has an href
            for img in doc.xpath('//f:coverpage/f:image',
                                 namespaces=NAMESPACES):
                href = img.get('{%s}href' % XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(os.getcwd(), mi)
        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir('.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        '''
        Write the base64 payload of every top-level ``binary`` element of
        ``doc`` to a file in the current working directory.

        The file is named after the element's ``id``. For PNG/JPEG content
        types the matching extension is appended when the id does not already
        end in one, and the id -> file name pair is recorded in
        ``self.binary_map``. Payloads that cannot be decoded are logged and
        skipped.

        :param doc: root element of the parsed FB2 document
        '''
        from calibre.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if not (elem.text and 'binary' in elem.tag
                    and 'id' in elem.attrib):
                continue
            ct = elem.get('content-type', '')
            # NOTE(review): the id comes straight from the document and is
            # used as a file name as-is; an id containing path separators
            # would be written outside the working directory -- confirm
            # whether it should be sanitized.
            fname = elem.attrib['id']
            ext = ct.rpartition('/')[-1].lower()
            if ext in ('png', 'jpeg', 'jpg'):
                if fname.lower().rpartition('.')[-1] not in {
                        'jpg', 'jpeg', 'png'
                }:
                    fname += '.' + ext
                self.binary_map[elem.get('id')] = fname
            raw = elem.text.strip()
            try:
                data = base64_decode(raw)
            except (TypeError, ValueError):
                # binascii.Error, which base64 decoding raises on Python 3,
                # is a ValueError; TypeError is kept for compatibility.
                self.log.exception(
                    'Binary data with id=%s is corrupted, ignoring' %
                    (elem.get('id')))
            else:
                with open(fname, 'wb') as f:
                    f.write(data)
Esempio n. 12
0
class PDFOutput(OutputFormatPlugin):
    '''Conversion output plugin for the ``pdf`` file type.'''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'

    # Options specific to PDF output
    options = {
        OptionRecommendation(
            name='override_profile_size',
            recommended_value=False,
            help=_(
                'Normally, the PDF page size is set by the output profile'
                ' chosen under the page setup options. This option will cause the '
                ' page size settings under PDF Output to override the '
                ' size specified by the output profile.')),
        OptionRecommendation(
            name='unit',
            recommended_value='inch',
            level=OptionRecommendation.LOW,
            short_switch='u',
            choices=UNITS,
            help=_(
                'The unit of measure for page sizes. Default is inch. Choices '
                'are %s '
                'Note: This does not override the unit for margins!') % UNITS),
        OptionRecommendation(
            name='paper_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAPER_SIZES,
            help=
            _('The size of the paper. This size will be overridden when a '
              'non default output profile is used. Default is letter. Choices '
              'are %s') % PAPER_SIZES),
        OptionRecommendation(
            name='custom_size',
            recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(
            name='pdf_serif_family',
            recommended_value='Liberation Serif'
            if islinux else 'Times New Roman',
            help=_('The font family used to render serif fonts')),
        OptionRecommendation(
            name='pdf_sans_family',
            recommended_value='Liberation Sans' if islinux else 'Helvetica',
            help=_('The font family used to render sans-serif fonts')),
        OptionRecommendation(
            name='pdf_mono_family',
            recommended_value='Liberation Mono' if islinux else 'Courier New',
            help=_('The font family used to render monospaced fonts')),
        # This help text used to be a verbatim copy of the pdf_mono_family
        # one, which does not describe this option.
        OptionRecommendation(
            name='pdf_standard_font',
            choices=['serif', 'sans', 'mono'],
            recommended_value='serif',
            help=_('The type of font family (serif, sans or mono) used for'
                   ' text that does not specify a font')),
        OptionRecommendation(name='pdf_default_font_size',
                             recommended_value=20,
                             help=_('The default font size')),
        OptionRecommendation(
            name='pdf_mono_font_size',
            recommended_value=16,
            help=_('The default font size for monospaced text')),
        OptionRecommendation(
            name='pdf_mark_links',
            recommended_value=False,
            help=_(
                'Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(
            name='old_pdf_engine',
            recommended_value=False,
            help=_('Use the old, less capable engine to generate the PDF')),
        OptionRecommendation(
            name='uncompressed_pdf',
            recommended_value=False,
            help=_('Generate an uncompressed PDF, useful for debugging, '
                   'only works with the new PDF engine.')),
        OptionRecommendation(
            name='pdf_page_numbers',
            recommended_value=False,
            help=_(
                'Add page numbers to the bottom of every page in the generated PDF file. If you '
                'specify a footer template, it will take precedence '
                'over this option.')),
        OptionRecommendation(
            name='pdf_footer_template',
            recommended_value=None,
            help=
            _('An HTML template used to generate %s on every page.'
              ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
              ) % _('footers')),
        OptionRecommendation(
            name='pdf_header_template',
            recommended_value=None,
            help=
            _('An HTML template used to generate %s on every page.'
              ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
              ) % _('headers')),
        OptionRecommendation(
            name='pdf_add_toc',
            recommended_value=False,
            help=
            _('Add a Table of Contents at the end of the PDF that lists page numbers. '
              'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.'
              )),
        OptionRecommendation(name='toc_title',
                             recommended_value=None,
                             help=_('Title for generated table of contents.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''Entry point of the conversion: remember the arguments on ``self``,
        derive ``self.metadata`` from the book's OEB metadata and dispatch to
        ``convert_images()`` or ``convert_text()`` depending on
        ``input_plugin.is_image_collection``.'''
        from calibre.gui2 import must_use_qt, load_builtin_fonts
        must_use_qt()
        load_builtin_fonts()

        self.oeb = oeb_book
        self.input_plugin = input_plugin
        self.opts = opts
        self.log = log
        self.output_path = output_path

        # Two different objects are called OPF in calibre: the tag name
        # helper from oeb.base and the parser class from metadata.opf2.
        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from lxml import etree
        from io import BytesIO
        root = etree.Element(
            opf_tag('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        from calibre.ebooks.metadata.opf2 import OPF as OPFParser
        self.oeb.metadata.to_opf2(root)
        serialized = BytesIO(etree.tostring(root))
        self.metadata = OPFParser(serialized).to_book_metadata()
        self.cover_data = None

        if not input_plugin.is_image_collection:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)
            return
        log.debug('Converting input as an image collection...')
        self.convert_images(input_plugin.get_images())

    def convert_images(self, images):
        '''Pass ``images`` to ``write()`` using ``ImagePDFWriter`` and no
        table of contents.'''
        from calibre.ebooks.pdf.writer import ImagePDFWriter as writer_class
        self.write(writer_class, images, toc=None)

    def get_cover_data(self):
        '''Set ``self.cover_data`` to the data of the manifest item named by
        the first cover entry of the book metadata, if there is one.'''
        book = self.oeb
        if not book.metadata.cover:
            return
        cover_id = unicode(book.metadata.cover[0])
        if cover_id in book.manifest.ids:
            self.cover_data = book.manifest.ids[cover_id].data

    def handle_embedded_fonts(self):
        ''' On windows, Qt uses GDI which does not support OpenType
        (CFF) fonts, so we need to nuke references to OpenType
        fonts. Qt's directwrite text backend is not mature.
        Also make sure all fonts are embeddable.

        For every manifest item whose data exposes ``cssRules``, each
        ``@font-face`` rule pointing at a font in the manifest is handled as
        follows:

        * the font is passed through ``remove_embed_restriction`` (once per
          path) and written back when the result differs;
        * on windows, the rule is removed when Qt can load the font but
          reports an empty ``head`` table, which is what this code treats
          as the CFF case. '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction
        from PyQt5.Qt import QByteArray, QRawFont

        font_warnings = set()
        processed = set()
        is_cff = {}
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            remove = set()
            for i, rule in enumerate(item.data.cssRules):
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                # Exception rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not swallowed here.
                try:
                    src = rule.style.getProperty('src').propertyValue[0].uri
                except Exception:
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue

                raw = nraw = ff.data
                if path not in processed:
                    processed.add(path)
                    try:
                        nraw = remove_embed_restriction(raw)
                    except Exception:
                        continue
                    if nraw != raw:
                        ff.data = nraw
                        self.oeb.container.write(path, nraw)

                if iswindows:
                    if path not in is_cff:
                        f = QRawFont(QByteArray(nraw), 12)
                        is_cff[path] = f.isValid() and len(
                            f.fontTable('head')) == 0
                    if is_cff[path]:
                        if path not in font_warnings:
                            font_warnings.add(path)
                            self.log.warn(
                                'CFF OpenType fonts are not supported on windows, ignoring: %s'
                                % path)
                        remove.add(i)
            # Pop from the end so the remaining indices stay valid
            for i in sorted(remove, reverse=True):
                item.data.cssRules.pop(i)

    def convert_text(self, oeb_book):
        '''Serialize ``oeb_book`` with the ``oeb`` output plugin into a
        temporary directory and hand the spine files of the resulting OPF
        to ``write()``, using the PDF writer selected by
        ``opts.old_pdf_engine``.'''
        from calibre.ebooks.metadata.opf2 import OPF
        if self.opts.old_pdf_engine:
            from calibre.ebooks.pdf.writer import PDFWriter
        else:
            from calibre.ebooks.pdf.render.from_html import PDFWriter

        self.log.debug('Serializing oeb input to disk for processing...')
        self.get_cover_data()

        self.handle_embedded_fonts()

        with TemporaryDirectory('_pdf_out') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            plugin_for_output_format('oeb').convert(
                oeb_book, tdir, self.input_plugin, self.opts, self.log)

            opf_files = glob.glob(os.path.join(tdir, '*.opf'))
            opf_path = opf_files[0]
            parsed = OPF(opf_path, os.path.dirname(opf_path))

            spine_paths = [entry.path for entry in parsed.spine]
            self.write(PDFWriter, spine_paths, getattr(parsed, 'toc', None))

    def write(self, Writer, items, toc):
        '''Render ``items`` to ``self.output_path`` with a ``Writer``.

        :param Writer: writer class, instantiated as
            ``Writer(opts, log, cover_data=..., toc=...)``
        :param items: passed unchanged to ``writer.dump()``
        :param toc: table of contents handed to the writer, may be None

        ``self.output_path`` is either a file-like object (anything with a
        ``write`` attribute), which is used as is and left open, or a path,
        in which case missing parent directories are created and the file
        is opened here and always closed again, even when rendering fails.
        '''
        import time
        writer = Writer(self.opts,
                        self.log,
                        cover_data=self.cover_data,
                        toc=toc)
        writer.report_progress = self.report_progress

        owns_stream = not hasattr(self.output_path, 'write')
        if owns_stream:
            out_dir = os.path.dirname(self.output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(self.output_path, 'wb')
        else:
            out_stream = self.output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            self.log.debug('Rendering pages to PDF...')
            st = time.time()
            writer.dump(items, out_stream, PDFMetadata(self.metadata))
            self.log('Rendered PDF in %g seconds:' % (time.time() - st))
        finally:
            # Only close what was opened here; a caller supplied stream
            # stays under the caller's control.
            if owns_stream:
                out_stream.close()
Esempio n. 13
0
class PDFOutput(OutputFormatPlugin):
    '''Conversion output plugin for the ``pdf`` file type.'''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'

    # Options specific to PDF output
    options = {
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
            'are %s '
            'Note: This does not override the unit for margins!') % UNITS),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
            'non default output profile is used. Default is letter. Choices '
            'are %s') % PAPER_SIZES),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
            'e.g. `123x321` to specify the width and height. '
            'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                ' of stretching it to fill the full first page of the'
                ' generated pdf.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Liberation Serif', help=_(
                'The font family used to render serif fonts')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Liberation Sans', help=_(
                'The font family used to render sans-serif fonts')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Liberation Mono', help=_(
                'The font family used to render monospace fonts')),
        # This help text used to be a verbatim copy of the pdf_mono_family
        # one, which does not describe this option.
        OptionRecommendation(name='pdf_standard_font', choices=['serif',
            'sans', 'mono'],
            recommended_value='serif', help=_(
                'The type of font family (serif, sans or mono) used for'
                ' text that does not specify a font')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging, '
                'only works with the new PDF engine.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),

        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
            ' This will cause the margins specified in the conversion settings to be ignored.'
            ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
    }

    def specialize_options(self, log, opts, input_fmt):
        '''Set the four common ``margin_*`` options to -1 when
        ``opts.pdf_use_document_margins`` is on.'''
        if not opts.pdf_use_document_margins:
            return
        # Prevent the conversion pipeline from overwriting document margins
        for side in ('left', 'right', 'top', 'bottom'):
            setattr(opts, 'margin_' + side, -1)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''Entry point of the conversion: split the book on page breaks,
        remember the arguments on ``self``, derive ``self.metadata`` from the
        book's OEB metadata and dispatch to ``convert_images()`` or
        ``convert_text()`` depending on ``input_plugin.is_image_collection``.

        ``CALIBRE_WEBKIT_NO_HINTING`` is set in the environment for the
        duration of the call and removed again afterwards.'''
        from calibre.gui2 import must_use_qt, load_builtin_fonts
        from calibre.ebooks.oeb.transforms.split import Split
        # Turn off hinting in WebKit (requires a patched build of QtWebKit)
        os.environ['CALIBRE_WEBKIT_NO_HINTING'] = '1'
        self.filtered_font_warnings = set()
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
        try:
            # split on page breaks, as the JS code to convert page breaks to
            # column breaks will not work because of QWebSettings.LocalContentCanAccessFileUrls
            splitter = Split()
            splitter(oeb_book, opts)
            must_use_qt()
            load_builtin_fonts()

            self.oeb = oeb_book
            self.input_plugin = input_plugin
            self.opts = opts
            self.log = log
            self.output_path = output_path

            # Two different objects are called OPF in calibre: the tag name
            # helper from oeb.base and the parser class from metadata.opf2.
            from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
            from lxml import etree
            from io import BytesIO
            root = etree.Element(
                opf_tag('package'),
                attrib={'version': '2.0', 'unique-identifier': 'dummy'},
                nsmap={None: OPF2_NS})
            from calibre.ebooks.metadata.opf2 import OPF as OPFParser
            self.oeb.metadata.to_opf2(root)
            serialized = BytesIO(etree.tostring(root))
            self.metadata = OPFParser(serialized).to_book_metadata()
            self.cover_data = None

            if not input_plugin.is_image_collection:
                log.debug('Converting input as a text based book...')
                self.convert_text(oeb_book)
            else:
                log.debug('Converting input as an image collection...')
                self.convert_images(input_plugin.get_images())
        finally:
            os.environ.pop('CALIBRE_WEBKIT_NO_HINTING', None)

    def convert_images(self, images):
        '''Pass ``images`` to ``write()`` using ``ImagePDFWriter`` and no
        table of contents.'''
        from calibre.ebooks.pdf.render.from_html import (
            ImagePDFWriter as writer_class)
        self.write(writer_class, images, toc=None)

    def get_cover_data(self):
        '''Set ``self.cover_data`` to the data of the manifest item named by
        the first cover entry of the book metadata, if there is one.'''
        covers = self.oeb.metadata.cover
        if not covers:
            return
        by_id = self.oeb.manifest.ids
        cover_id = unicode(covers[0])
        if cover_id in by_id:
            self.cover_data = by_id[cover_id].data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable. Also remove some fonts that cause problems.

        For every manifest item whose data exposes ``cssRules``:

        * each ``@font-face`` rule pointing at a font in the manifest has
          that font passed through ``remove_embed_restriction`` (once per
          path) and written back when the result differs;
        * on windows, ``courier`` is removed from the ``font-family`` of
          style rules, falling back to ``monospace`` when nothing else is
          left. '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type == rule.FONT_FACE_RULE:
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt/SystemExit are not swallowed here.
                    try:
                        src = rule.style.getProperty('src').propertyValue[0].uri
                    except Exception:
                        continue
                    path = item.abshref(src)
                    font_item = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                    if font_item is None or path in processed:
                        continue
                    processed.add(path)

                    raw = font_item.data
                    try:
                        nraw = remove_embed_restriction(raw)
                    except Exception:
                        continue
                    if nraw != raw:
                        font_item.data = nraw
                        self.oeb.container.write(path, nraw)
                elif iswindows and rule.type == rule.STYLE_RULE:
                    from tinycss.fonts3 import parse_font_family, serialize_font_family
                    f = rule.style.getProperty(u'font-family')
                    if f is None:
                        continue
                    font_families = parse_font_family(f.propertyValue.cssText)
                    kept = [x for x in font_families if x.lower() != u'courier']
                    if len(kept) != len(font_families):
                        if 'courier' not in self.filtered_font_warnings:
                            # See https://bugs.launchpad.net/bugs/1665835
                            self.filtered_font_warnings.add(u'courier')
                            self.log.warn(u'Removing courier font family as it does not render on windows')
                        f.propertyValue.cssText = serialize_font_family(kept or [u'monospace'])

    def convert_text(self, oeb_book):
        '''Serialize ``oeb_book`` with the ``oeb`` output plugin into a
        temporary directory and hand the spine files of the resulting OPF
        to ``write()``.

        When ``opts.pdf_use_document_margins`` is set, the margins stored by
        ``convert()`` in ``self.stored_page_margins`` (keyed by href) are
        first attached to the root element of the matching manifest items
        as a JSON ``data-calibre-pdf-output-page-margins`` attribute.'''
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.pdf.render.from_html import PDFWriter

        self.log.debug('Serializing oeb input to disk for processing...')
        self.get_cover_data()

        self.process_fonts()
        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            import json
            # items() rather than iteritems(): works on Python 2 and 3
            for href, margins in self.stored_page_margins.items():
                item = oeb_book.manifest.hrefs.get(href)
                if item is None:
                    # The href is not (or no longer) in the manifest, there
                    # is nothing to attach the margins to.
                    continue
                root = item.data
                if hasattr(root, 'xpath') and margins:
                    root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)

            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            opf = OPF(opfpath, os.path.dirname(opfpath))

            self.write(PDFWriter, [s.path for s in opf.spine], getattr(opf,
                'toc', None))

    def write(self, Writer, items, toc):
        '''Render ``items`` to ``self.output_path`` with a ``Writer``.

        :param Writer: writer class, instantiated as
            ``Writer(opts, log, cover_data=..., toc=...)``
        :param items: passed unchanged to ``writer.dump()``
        :param toc: table of contents handed to the writer, may be None

        ``self.output_path`` is either a file-like object (anything with a
        ``write`` attribute), which is used as is and left open, or a path,
        in which case missing parent directories are created and the file
        is opened here and always closed again, even when rendering fails.
        '''
        import time
        writer = Writer(self.opts, self.log, cover_data=self.cover_data,
                toc=toc)
        writer.report_progress = self.report_progress

        owns_stream = not hasattr(self.output_path, 'write')
        if owns_stream:
            out_dir = os.path.dirname(self.output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(self.output_path, 'wb')
        else:
            out_stream = self.output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            self.log.debug('Rendering pages to PDF...')
            st = time.time()
            writer.dump(items, out_stream, PDFMetadata(self.metadata))
            self.log('Rendered PDF in %g seconds:'%(time.time()-st))
        finally:
            # Only close what was opened here; a caller supplied stream
            # stays under the caller's control.
            if owns_stream:
                out_stream.close()
Esempio n. 14
0
class RTFInput(InputFormatPlugin):
    '''Conversion input plugin for the ``rtf`` file type.'''

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = _('Convert RTF files to HTML')
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    # Options specific to RTF input
    options = {
        OptionRecommendation(
            name='ignore_wmf', recommended_value=False,
            help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
    }

    def generate_xml(self, stream):
        '''Run ``ParseRtf`` on ``stream`` and return the XML it produces.

        :param stream: handed to ``ParseRtf`` as ``in_file``
        :return: the raw bytes of ``dataxml.xml``, the file the parser is
            told to write in the current directory

        When ``opts.debug_pipeline`` is set, an ``rtfdebug`` directory is
        created and the parser is run at run level 4 with indented output;
        if that setup fails the parser runs with the normal settings.
        '''
        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = 'dataxml.xml'
        run_lev, debug_dir, indent_out = 1, None, 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            # Exception rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            try:
                os.mkdir('rtfdebug')
                debug_dir = 'rtfdebug'
                run_lev = 4
                indent_out = 1
                self.log('Running RTFParser in debug mode')
            except Exception:
                self.log.warn('Impossible to run RTFParser in debug mode')
        parser = ParseRtf(
            in_file=stream,
            out_file=ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol=1,

            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf=1,

            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings=1,

            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps=1,

            # Indent resulting XML.
            # Default is 0 (no indent).
            indent=indent_out,

            # Form lists from RTF. Default is 1.
            form_lists=1,

            # Convert headings to sections. Default is 0.
            headings_to_sections=1,

            # Group paragraphs with the same style name. Default is 1.
            group_styles=1,

            # Group borders. Default is 1.
            group_borders=1,

            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs=1,

            # Debug
            deb_dir=debug_dir,

            # Default encoding
            default_encoding=getattr(self.opts, 'input_encoding', 'cp1252')
            or 'cp1252',

            # Run level
            run_level=run_lev,
        )
        parser.parse_rtf()
        with open(ofile, 'rb') as f:
            return f.read()

    def extract_images(self, picts):
        '''Write every ``{\\pict ...}`` group found in the file ``picts`` to
        its own image file in the current directory.

        The hex digits of each group are decoded and saved as
        ``NNNN.<fmt>``, where ``fmt`` is what ``imghdr`` detects, or ``wmf``
        when it detects nothing. Returns the result of ``convert_images()``
        applied to the ``{number: file name}`` map.'''
        from calibre.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as src:
            rtf_bytes = src.read()
        non_hex = re.compile(br'[^a-fA-F0-9]')
        groups = [g for g in re.findall(br'\{\\pict([^}]+)\}', rtf_bytes)
                  if len(g)]
        hex_blobs = [non_hex.sub(b'', g) for g in groups]

        imap = {}
        for number, hexdata in enumerate(hex_blobs, 1):
            # unhexlify needs an even number of digits
            if len(hexdata) % 2 == 1:
                hexdata = hexdata[:-1]
            data = unhexlify(hexdata)
            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            name = '%04d.%s' % (number, fmt)
            with open(name, 'wb') as out:
                out.write(data)
            imap[number] = name
        return self.convert_images(imap)

    def convert_images(self, imap):
        '''Run ``convert_image()`` over every file name in ``imap``.

        :param imap: dict mapping image number to file name, updated in
            place with the converted names
        :return: the same ``imap``

        An entry whose conversion fails is logged and keeps its original
        file name.'''
        self.default_img = None
        for count, val in iteritems(imap):
            # Exception rather than a bare except, so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            try:
                imap[count] = self.convert_image(val)
            except Exception:
                self.log.exception('Failed to convert', val)
        return imap

    def convert_image(self, name):
        '''Return the file name to use for the image ``name``.

        Anything not ending in ``.wmf`` is returned unchanged. WMF files are
        rasterized; if that fails the failure is logged and
        ``replace_wmf()`` decides what to use instead.'''
        is_wmf = name.endswith('.wmf')
        if not is_wmf:
            return name
        try:
            rasterized = self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r' % name)
        else:
            return rasterized
        return self.replace_wmf(name)

    def replace_wmf(self, name):
        '''Substitute the WMF file ``name``.

        With ``opts.ignore_wmf`` the file is deleted and the marker
        ``'__REMOVE_ME__'`` is returned. Otherwise a message image (created
        once and cached in ``self.default_img``) is written next to it with
        a ``.jpg`` extension and that file name is returned.'''
        if self.opts.ignore_wmf:
            os.remove(name)
            return '__REMOVE_ME__'

        from calibre.ebooks.covers import message_image
        placeholder = self.default_img
        if placeholder is None:
            placeholder = self.default_img = message_image(
                'Conversion of WMF images is not supported.'
                ' Use Microsoft Word or OpenOffice to save this RTF file'
                ' as HTML and convert that in calibre.')
        jpg_name = name.replace('.wmf', '.jpg')
        with lopen(jpg_name, 'wb') as out:
            out.write(placeholder)
        return jpg_name

    def rasterize_wmf(self, name):
        '''Pass the contents of the file ``name`` through ``wmf_unwrap`` and
        save the result under the same name with ``.wmf`` replaced by
        ``.png``. Returns the new file name.'''
        from calibre.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as src:
            wmf_bytes = src.read()
        unwrapped = wmf_unwrap(wmf_bytes)
        png_name = name.replace('.wmf', '.png')
        with open(png_name, 'wb') as out:
            out.write(unwrapped)
        return png_name

    def write_inline_css(self, ic, border_styles):
        '''Append the stylesheet for the converted document to ``styles.css``.

        The output consists of a fixed set of ``span`` classes, one
        ``span.fsN`` rule per entry of ``ic.font_sizes``, one ``span.colN``
        rule per entry of ``ic.colors`` that is not ``'false'`` and one rule
        per ``class name -> declarations`` pair in ``border_styles``.'''
        base_css = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }

        span.italics { font-style: italic }

        span.bold { font-weight: bold }

        span.small-caps { font-variant: small-caps }

        span.underlined { text-decoration: underline }

        span.strike-through { text-decoration: line-through }

        ''')
        size_rules = []
        for i, x in enumerate(ic.font_sizes):
            size_rules.append('span.fs%d { font-size: %spt }' % (i, x))
        color_rules = []
        for i, x in enumerate(ic.colors):
            if x != 'false':
                color_rules.append('span.col%d { color: %s }' % (i, x))

        chunks = [
            base_css,
            '\n', '\n'.join(size_rules),
            '\n', '\n'.join(color_rules),
        ]
        for cls, val in iteritems(border_styles):
            chunks.append('\n\n.%s {\n%s\n}' % (cls, val))

        # Opened in append mode: existing content of styles.css is kept
        with open('styles.css', 'ab') as f:
            f.write(''.join(chunks).encode('utf-8'))

    def convert_borders(self, doc):
        '''Give every ``cell`` element of ``doc`` a ``class`` attribute that
        stands for its border declarations.

        Cells with identical declarations share a class; classes are named
        ``border_styleN`` in order of first appearance. Returns a dict
        mapping each class name to its declarations.'''
        class_for = {}  # declarations text -> class name
        for cell in doc.xpath(r'//*[local-name()="cell"]'):
            decls = [
                'border-style: hidden',
                'border-width: 1px',
                'border-color: black',
            ]
            for side in ('bottom', 'top', 'left', 'right'):
                line_style = cell.get('border-cell-%s-style' % side, None)
                if line_style:
                    decls.append('border-%s-style: %s' % (
                        side, border_style_map.get(line_style, 'solid')))
                width = cell.get('border-cell-%s-line-width' % side, None)
                if width:
                    decls.append('border-%s-width: %spt' % (side, width))
                color = cell.get('border-cell-%s-color' % side, None)
                if color:
                    decls.append('border-%s-color: %s' % (side, color))
            css = ';\n'.join(decls)
            if css not in class_for:
                class_for[css] = 'border_style%d' % len(class_for)
            cell.set('class', class_for[css])

        style_map = {}
        for css, cls in class_for.items():
            style_map[cls] = css
        return style_map

    def convert(self, stream, options, file_ext, log, accelerators):
        '''Convert the RTF file ``stream`` into ``index.xhtml``,
        ``styles.css`` and ``metadata.opf`` in the current directory.

        :param stream: open RTF file; its ``name`` is given to the parser
            and the stream itself is re-read for metadata
        :param options: conversion options, stored as ``self.opts``
        :param log: logger, stored as ``self.log``
        :return: absolute path of the generated ``metadata.opf``
        :raises ValueError: when the RTF parser reports an
            ``RtfInvalidCodeException``

        ``file_ext`` and ``accelerators`` are not used here.
        '''
        from lxml import etree
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        from calibre.ebooks.rtf.input import InlineClass
        from calibre.utils.xml_parse import safe_xml_fromstring
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError(
                _('This RTF file has a feature calibre does not '
                  'support. Convert it to HTML first and then try it.\n%s') %
                e)

        # Defined unconditionally: the pict loop below looks names up in it
        # even when no picture directory was produced.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = safe_xml_fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Point every numbered pict element at its extracted file
        for pict in doc.xpath(
                '//rtf:pict[@num]',
                namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True),
                                       recover=False)
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = 'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([('index.xhtml', None)])
        opf.create_spine(['index.xhtml'])
        # Context manager so the OPF is flushed and closed before its path
        # is handed back to the caller.
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        '''Remove every ``img`` whose ``src`` is ``__REMOVE_ME__`` from the
        spine documents, keeping the text that followed the image.'''
        marker = '//*[local-name()="img" and @src="__REMOVE_ME__"]'
        for item in oeb.spine:
            for img in item.data.xpath(marker):
                parent = img.getparent()
                pos = parent.index(img)
                parent.remove(img)
                tail = img.tail
                if not tail:
                    continue
                # Re-attach the tail text to whatever now precedes the gap
                if pos == 0:
                    parent.text = (parent.text or '') + tail
                else:
                    before = parent[pos - 1]
                    before.tail = (before.tail or '') + tail
Esempio n. 15
0
class ComicInput(InputFormatPlugin):
    """Input plugin for comic archives and ``cbc`` comic collections.

    ``convert`` extracts the comic(s), collects (and optionally processes)
    the page images, writes XHTML wrapper pages for them and renders
    ``metadata.opf`` and ``toc.ncx`` into the current working directory.
    """

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = _(
        'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
    )
    # File extensions handled by this plugin; 'cbc' is treated specially in
    # convert() as a collection of several comics.
    file_types = {'cbz', 'cbr', 'cb7', 'cbc'}
    # NOTE(review): presumably signals to the conversion framework that the
    # page images are available through get_images(); confirm against
    # InputFormatPlugin.
    is_image_collection = True
    commit_name = 'comic_input'
    # NOTE(review): meaning of -1 is defined by the base class (not visible
    # here); confirm before changing.
    core_usage = -1

    # Plugin-specific options. This class reads no_sort, no_process and
    # dont_add_comic_pages_to_toc directly; the whole options object is also
    # passed on to process_pages() in get_pages().
    options = {
        OptionRecommendation(
            name='colors',
            recommended_value=0,
            help=
            _('Reduce the number of colors used in the image. This works only'
              ' if you choose the PNG output format. It is useful to reduce file sizes.'
              ' Set to zero to turn off. Maximum value is 256. It is off by default.'
              )),
        OptionRecommendation(
            name='dont_normalize',
            recommended_value=False,
            help=_('Disable normalize (improve contrast) color range '
                   'for pictures. Default: False')),
        OptionRecommendation(
            name='keep_aspect_ratio',
            recommended_value=False,
            help=_(
                'Maintain picture aspect ratio. Default is to fill the screen.'
            )),
        OptionRecommendation(name='dont_sharpen',
                             recommended_value=False,
                             help=_('Disable sharpening.')),
        OptionRecommendation(
            name='disable_trim',
            recommended_value=False,
            help=_('Disable trimming of comic pages. For some comics, '
                   'trimming might remove content as well as borders.')),
        OptionRecommendation(
            name='landscape',
            recommended_value=False,
            help=_("Don't split landscape images into two portrait images")),
        OptionRecommendation(
            name='wide',
            recommended_value=False,
            help=_("Keep aspect ratio and scale image using screen height as "
                   "image width for viewing in landscape mode.")),
        OptionRecommendation(
            name='right2left',
            recommended_value=False,
            help=_('Used for right-to-left publications like manga. '
                   'Causes landscape pages to be split into portrait pages '
                   'from right to left.')),
        OptionRecommendation(name='despeckle',
                             recommended_value=False,
                             help=_('Enable Despeckle. Reduces speckle noise. '
                                    'May greatly increase processing time.')),
        # Passed to find_pages() as sort_on_mtime in get_pages().
        OptionRecommendation(
            name='no_sort',
            recommended_value=False,
            help=_("Don't sort the files found in the comic "
                   "alphabetically by name. Instead use the order they were "
                   "added to the comic.")),
        OptionRecommendation(
            name='output_format',
            choices=['png', 'jpg'],
            recommended_value='png',
            help=_(
                'The format that images in the created e-book '
                'are converted to. You can experiment to see which format gives '
                'you optimal size and look on your device.')),
        # When set, get_pages() copies the page files verbatim instead of
        # running process_pages() on them.
        OptionRecommendation(name='no_process',
                             recommended_value=False,
                             help=_("Apply no processing to the image")),
        OptionRecommendation(
            name='dont_grayscale',
            recommended_value=False,
            help=_('Do not convert the image to grayscale (black and white)')),
        OptionRecommendation(
            name='comic_image_size',
            recommended_value=None,
            help=_(
                'Specify the image size as widthxheight pixels. Normally,'
                ' an image size is automatically calculated from the output '
                'profile, this option overrides it.')),
        # Checked in convert() when building the TOC for multi-comic input.
        OptionRecommendation(
            name='dont_add_comic_pages_to_toc',
            recommended_value=False,
            help=_(
                'When converting a CBC do not add links to each page to'
                ' the TOC. Note this only applies if the TOC has more than one'
                ' section')),
    }

    # Conversion settings this plugin recommends at HIGH level; each entry is
    # an (option name, value, level) tuple.
    # The former ('page_breaks_brefore', None, HIGH) entry was dropped: it was
    # a misspelled duplicate of 'page_breaks_before', which is listed below.
    # NOTE(review): assumes no option is actually named 'page_breaks_brefore'.
    recommendations = {
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('change_justification', 'left', OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
    }

    def get_comics_from_collection(self, stream):
        """Extract a ``cbc`` collection and return its comics as ``[title, path]`` pairs.

        The archive in ``stream`` is unzipped into a persistent temporary
        directory and its ``comics.txt`` index is read. Each non-empty line has
        the form ``filename[:title]``; when the title is missing, the file name
        without its extension is used. Entries that are not readable, or that
        resolve to a location outside the extraction directory, are skipped.

        Raises:
            ValueError: if ``comics.txt`` is missing or no usable comic is listed.
        """
        from calibre.libunzip import extract as zipextract
        tdir = PersistentTemporaryDirectory('_comic_collection')
        zipextract(stream, tdir)
        # Canonical form of the extraction root, used to reject index entries
        # that escape it through '..' components or links.
        root = os.path.normcase(os.path.realpath(tdir))
        root_prefix = os.path.join(root, '')
        comics = []
        with CurrentDir(tdir):
            if not os.path.exists('comics.txt'):
                raise ValueError(
                    ('%s is not a valid comic collection'
                     ' no comics.txt was found in the file') % stream.name)
            with open('comics.txt', 'rb') as f:
                raw = f.read()
            # Decode according to the byte order mark, dropping the decoded
            # BOM character; without a BOM the index is assumed to be UTF-8.
            for bom, encoding in ((codecs.BOM_UTF16_BE, 'utf-16-be'),
                                  (codecs.BOM_UTF16_LE, 'utf-16-le'),
                                  (codecs.BOM_UTF8, 'utf-8')):
                if raw.startswith(bom):
                    raw = raw.decode(encoding)[1:]
                    break
            else:
                raw = raw.decode('utf-8')
            for line in raw.splitlines():
                line = line.strip()
                if not line:
                    continue
                fname, sep, title = line.partition(':')
                fname = fname.replace('#', '_')
                fname = os.path.join(tdir, *fname.split('/'))
                # comics.txt comes from the (untrusted) archive: never follow
                # an entry that points outside the extracted tree.
                resolved = os.path.normcase(os.path.realpath(fname))
                if resolved != root and not resolved.startswith(root_prefix):
                    continue
                if not title:
                    title = os.path.basename(fname).rpartition('.')[0]
                if os.access(fname, os.R_OK):
                    comics.append([title, fname])
        if not comics:
            raise ValueError('%s has no comics' % stream.name)
        return comics

    def get_pages(self, comic, tdir2):
        """Extract ``comic`` and return the list of page files placed in ``tdir2``.

        With the ``no_process`` option the pages found are copied unchanged
        into ``tdir2`` under the name ``"<index> - <original name>"``.
        Otherwise they are handed to ``process_pages`` and pages that could not
        be processed are reported through ``self.log``.

        Raises:
            ValueError: if the comic contains no pages, or processing leaves
                no valid page.
        """
        from calibre.ebooks.comic.input import (extract_comic, process_pages,
                                                find_pages)
        tdir = extract_comic(comic)
        new_pages = find_pages(tdir,
                               sort_on_mtime=self.opts.no_sort,
                               verbose=self.opts.verbose)
        if not new_pages:
            raise ValueError('Could not find any pages in the comic: %s' %
                             comic)
        if self.opts.no_process:
            copied = []
            for i, page in enumerate(new_pages):
                dest = os.path.join(
                    tdir2, '{} - {}'.format(i, os.path.basename(page)))
                shutil.copyfile(page, dest)
                copied.append(dest)
            return copied

        new_pages, failures = process_pages(new_pages, self.opts,
                                            self.report_progress, tdir2)
        if failures:
            self.log.warning('Could not process the following pages '
                             '(run with --verbose to see why):')
            for f in failures:
                self.log.warning('\t', f)
        if not new_pages:
            raise ValueError(
                'Could not find any valid pages in comic: %s' % comic)
        # The thumbnail path that used to be computed here was never returned
        # or otherwise used, so that dead code has been removed.
        return new_pages

    def get_images(self):
        """Return the page image paths collected by the last ``convert`` call."""
        images = self._images
        return images

    def convert(self, stream, opts, file_ext, log, accelerators):
        """Convert a comic (or ``cbc`` collection) into an OPF package.

        Pages of every comic are gathered with ``get_pages``; XHTML wrappers
        are generated for them (one viewer wrapper per comic when
        ``self.for_viewer`` is set, otherwise one wrapper per page). The
        manifest, spine and table of contents are then written to
        ``metadata.opf`` and ``toc.ncx`` in the current working directory.

        Returns the absolute path of the generated ``metadata.opf``.

        Raises:
            ValueError: if no comic yields any page.
        """
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            sources = self.get_comics_from_collection(stream)
        else:
            sources = [['Comic', os.path.abspath(stream.name)]]
        stream.close()

        # A collection gets one sub-directory per comic; a lone comic is
        # unpacked straight into the working directory.
        multiple_sources = len(sources) > 1
        comics = []
        page_counts = []
        for number, (title, fname) in enumerate(sources, 1):
            cdir = os.path.abspath(
                'comic_%d' % number if multiple_sources else '.')
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            page_counts.append(len(pages))
            if self.for_viewer:
                wrappers = [self.create_viewer_wrapper(pages, cdir)]
            else:
                wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        book_title = os.path.basename(stream.name).rpartition('.')[0]
        mi = MetaInformation(book_title, [_('Unknown')])
        opf = OPFCreator(os.getcwd(), mi)

        single_comic = len(comics) == 1

        def href(x):
            # Files of a lone comic sit next to the OPF; otherwise keep the
            # containing comic directory in the reference.
            if single_comic:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        def page_label(number):
            return _('Page') + ' %d' % number

        # Manifest: for each comic, its wrappers followed by its page images.
        entries = []
        cover_href = None
        for comic_title, pages, wrappers in comics:
            page_hrefs = [href(p) for p in pages]
            entries.extend((href(w), None) for w in wrappers)
            entries.extend((p, None) for p in page_hrefs)
            if cover_href is None and page_hrefs:
                cover_href = page_hrefs[0]
        opf.create_manifest(entries)

        spine = [href(w) for comic in comics for w in comic[2]]
        self._images = [p for comic in comics for p in comic[1]]
        opf.create_spine(spine)

        if self.for_viewer and cover_href:
            if os.path.isabs(cover_href):
                cover_href = os.path.relpath(cover_href).replace(os.sep, '/')
            opf.guide.set_cover(cover_href)

        toc = TOC()
        if single_comic:
            wrappers = comics[0][2]
            if self.for_viewer:
                # One wrapper holds every page; link to the per-page anchors.
                wrapper_page_href = href(wrappers[0])
                for i in range(page_counts[0]):
                    toc.add_item('{}#page_{}'.format(wrapper_page_href, i + 1),
                                 None,
                                 page_label(i + 1),
                                 play_order=i)
            else:
                for i, wrapper in enumerate(wrappers):
                    toc.add_item(href(wrapper),
                                 None,
                                 page_label(i + 1),
                                 play_order=i)
        else:
            # One section per comic, optionally with its pages as children;
            # the play order runs continuously across all entries.
            play_order = 0
            for num_pages, (comic_title, pages, wrappers) in zip(
                    page_counts, comics):
                play_order += 1
                section = toc.add_item(href(wrappers[0]),
                                       None,
                                       comic_title,
                                       play_order=play_order)
                if opts.dont_add_comic_pages_to_toc:
                    continue
                if self.for_viewer:
                    wrapper_page_href = href(wrappers[0])
                    for i in range(num_pages):
                        section.add_item('{}#page_{}'.format(
                            wrapper_page_href, i + 1),
                                         None,
                                         page_label(i + 1),
                                         play_order=play_order)
                        play_order += 1
                else:
                    for i, wrapper in enumerate(wrappers):
                        section.add_item(href(wrapper),
                                         None,
                                         page_label(i + 1),
                                         play_order=play_order)
                        play_order += 1
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')

    def create_wrappers(self, pages):
        """Write one XHTML wrapper per page image and return the wrapper paths.

        The wrappers are named ``page_<n>.xhtml`` (``n`` starting at 1) and are
        created in the directory of the first page. Every wrapper references
        its image by base name, so images are expected to live in that same
        directory. ``pages`` must not be empty.
        """
        from html import escape
        from calibre.ebooks.oeb.base import XHTML_NS
        WRAPPER = textwrap.dedent('''\
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <title>Page #%d</title>
                <style type="text/css">
                    @page { margin:0pt; padding: 0pt}
                    body { margin: 0pt; padding: 0pt}
                    div { text-align: center }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="comic page #%d" />
                </div>
            </body>
        </html>
        ''')
        out_dir = os.path.dirname(pages[0])
        wrappers = []
        for number, page in enumerate(pages, 1):
            # File names may contain '&', '<' or quotes; escape them so the
            # src attribute stays well-formed XHTML.
            src = escape(os.path.basename(page), quote=True)
            markup = WRAPPER % (XHTML_NS, number, src, number)
            wrapper_path = os.path.join(out_dir, 'page_%d.xhtml' % number)
            with open(wrapper_path, 'wb') as f:
                f.write(markup.encode('utf-8'))
            wrappers.append(wrapper_path)
        return wrappers

    def create_viewer_wrapper(self, pages, cdir):
        """Write a single XHTML file showing all ``pages`` and return its path.

        Each page becomes an ``<img id="page_<n>">`` element (``n`` starting
        at 1) that references the image by base name. The file is written to
        ``<cdir>/wrapper.xhtml``.
        """
        from html import escape
        from calibre.ebooks.oeb.base import XHTML_NS

        def page(pnum, src):
            # Escape the file name so characters such as '&' or '"' cannot
            # break the attribute value.
            return '<img id="page_{}" src="{}"></img>'.format(
                pnum + 1, escape(os.path.basename(src), quote=True))

        images_markup = '\n'.join(page(i, src) for i, src in enumerate(pages))
        wrapper = '''
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <style type="text/css">
                html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
                img {
                    width: 100%%; height: 100%%;
                    object-fit: contain;
                    margin-left: auto; margin-right: auto;
                    max-width: 100vw; max-height: 100vh;
                    top: 50vh; transform: translateY(-50%%);
                    position: relative;
                    page-break-after: always;
                }
                </style>
            </head>
            <body>
            %s
            </body>
        </html>
        ''' % (XHTML_NS, images_markup)
        # The previous code derived a "base" directory from pages[0] *after*
        # pages had been rebound to the joined markup string, so it was always
        # the empty string. Joining with cdir alone yields the same path.
        path = os.path.join(cdir, 'wrapper.xhtml')
        with open(path, 'wb') as f:
            f.write(wrapper.encode('utf-8'))
        return path
Esempio n. 16
0
class SNBOutput(OutputFormatPlugin):
    """Output plugin that packages a book as an SNB file.

    ``convert`` writes the book info, table of contents, chapter files and
    images into a temporary directory tree and packs it with ``SNBFile``.
    """

    name = 'SNB Output'
    author = 'Li Fanxi'
    file_type = 'snb'
    commit_name = 'snb_output'

    # Plugin-specific options, all recommended at LOW level. Only
    # snb_full_screen is read in this class (HandleImage); the options object
    # as a whole is passed to SNBMLizer in convert().
    options = {
        OptionRecommendation(
            name='snb_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is utf-8.')),
        OptionRecommendation(
            name='snb_max_line_length',
            recommended_value=0,
            level=OptionRecommendation.LOW,
            help=
            _('The maximum number of characters per line. This splits on '
              'the first space before the specified value. If no space is found '
              'the line will be broken at the space after and will exceed the '
              'specified value. Also, there is a minimum of 25 characters. '
              'Use 0 to disable line splitting.')),
        OptionRecommendation(
            name='snb_insert_empty_line',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert an empty line between '
                   'two paragraphs.')),
        # NOTE(review): the option name reads as "don't indent" while the help
        # text describes inserting the indent; confirm which one is intended.
        OptionRecommendation(
            name='snb_dont_indent_first_line',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert two space characters '
                   'to indent the first line of each paragraph.')),
        OptionRecommendation(
            name='snb_hide_chapter_name',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Specify whether or not to hide the chapter title for each '
                   'chapter. Useful for image-only output (eg. comics).')),
        # Selects output_profile.screen_size instead of comic_screen_size as
        # the resize bound in HandleImage().
        OptionRecommendation(
            name='snb_full_screen',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Resize all the images for full screen mode. ')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` as an SNB file.

        Steps:

        1. Rasterize SVG content when the rasterizer is available.
        2. Write ``snbf/book.snbf`` (metadata) and ``snbf/toc.snbf`` (chapter
           list) into a temporary directory.
        3. Convert every spine document to ``snbc`` chapter files. Documents
           that have no TOC entry are merged into the preceding chapter.
        4. Convert/resize all manifest images into ``snbc/images``.
        5. Pack the directory with ``SNBFile``.
        """
        from lxml import etree
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
        # Imported once here rather than on every iteration of the spine loop.
        from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES

        def write_tree(path, tree):
            # Serialize an lxml tree as pretty-printed UTF-8 XML.
            with open(path, 'wb') as f:
                f.write(
                    etree.tostring(tree,
                                   pretty_print=True,
                                   encoding='utf-8'))

        self.opts = opts
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process Meta data
            meta = oeb_book.metadata
            title = str(meta.title[0]) if meta.title else ''
            authors = [str(x) for x in meta.creator if x.role == 'aut']
            publishers = str(meta.publisher[0]) if meta.publisher else ''
            lang = str(meta.language[0]).upper() if meta.language else ''
            abstract = str(meta.description[0]) if meta.description else ''

            # Process Cover: the guide's cover is only used when the guide has
            # no title page entry.
            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
            href = None
            if 'titlepage' not in g and 'cover' in g:
                href = g['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(
                headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            coverElement = etree.SubElement(headTree, "cover")
            if href is not None:
                coverElement.text = ProcessFileName(href)
            write_tree(os.path.join(snbfDir, 'book.snbf'), bookInfoTree)

            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            # Maps a document href to its list of (fragment id, title) pairs;
            # the empty fragment stands for the start of the document.
            outputFiles = {}
            first = next(iter(oeb_book.spine))
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                         'Creating a default TOC')
                oeb_book.toc.add(_('Start page'), first.href)
            elif oeb_book.toc[0].href != first.href:
                # The pages before the first item in toc will be stored as
                # "Cover Pages".
                # oeb_book.toc does not support "insert", so we generate
                # the tocInfoTree directly instead of modifying the toc
                ch = etree.SubElement(tocBody, "chapter")
                ch.set("src", ProcessFileName(first.href) + ".snbc")
                ch.text = _('Cover pages')
                outputFiles[first.href] = [("", _("Cover pages"))]

            for tocitem in oeb_book.toc:
                if '#' in tocitem.href:
                    parts = tocitem.href.split('#')
                    if len(parts) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        docHref, fragment = parts
                        if docHref not in outputFiles:
                            # First reference into this document points at an
                            # anchor: the content before it becomes a
                            # "(Preface)" chapter of its own.
                            preface = tocitem.title + _(" (Preface)")
                            outputFiles[docHref] = [("", preface)]
                            ch = etree.SubElement(tocBody, "chapter")
                            ch.set("src", ProcessFileName(docHref) + ".snbc")
                            ch.text = preface
                        outputFiles[docHref].append((fragment, tocitem.title))
                else:
                    outputFiles.setdefault(tocitem.href, []).append(
                        ("", tocitem.title))
                ch = etree.SubElement(tocBody, "chapter")
                ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                ch.text = tocitem.title

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)

            write_tree(os.path.join(snbfDir, 'toc.snbf'), tocInfoTree)

            # Output Files
            # oldTree/lastName track the chapter written most recently;
            # mergeLast is set while documents without a TOC entry are being
            # appended to that chapter, which then has to be written again.
            oldTree = None
            mergeLast = False
            lastName = None
            for item in s:
                if m.hrefs[item.href].media_type in OEB_DOCS:
                    if item.href not in outputFiles:
                        log.debug(
                            'File %s is unused in TOC. Continue in last chapter'
                            % item.href)
                        mergeLast = True
                    elif oldTree is not None and mergeLast:
                        log.debug('Output the modified chapter again: %s' %
                                  lastName)
                        write_tree(os.path.join(snbcDir, lastName), oldTree)
                        mergeLast = False

                    log.debug('Converting %s to snbc...' % item.href)
                    snbwriter = SNBMLizer(log)
                    if not mergeLast:
                        snbcTrees = snbwriter.extract_content(
                            oeb_book, item, outputFiles[item.href], opts)
                        for subName in snbcTrees:
                            postfix = ''
                            if subName != '':
                                postfix = '_' + subName
                            lastName = ProcessFileName(item.href + postfix +
                                                       ".snbc")
                            oldTree = snbcTrees[subName]
                            write_tree(os.path.join(snbcDir, lastName),
                                       oldTree)
                    else:
                        log.debug('Merge %s with last TOC item...' % item.href)
                        snbwriter.merge_content(oldTree, oeb_book, item,
                                                [('', _("Start"))], opts)

            # Output the last one if needed
            if oldTree is not None and mergeLast:
                # Logged only when the chapter really is rewritten.
                log.debug('Output the last modified chapter again: %s' %
                          lastName)
                write_tree(os.path.join(snbcDir, lastName), oldTree)
                mergeLast = False

            for item in m:
                if m.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = m.hrefs[item.href].data
                    # Convert & Resize image
                    self.HandleImage(
                        content,
                        os.path.join(snbiDir, ProcessFileName(item.href)))

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

    def HandleImage(self, imageData, imagePath):
        """Write ``imageData`` to ``imagePath``, shrinking it to fit the screen.

        The bounding size is ``output_profile.screen_size`` when the
        ``snb_full_screen`` option is set, otherwise
        ``output_profile.comic_screen_size``; 540x700 is used when no options
        are available. Images that already fit are written unscaled. The output
        format is taken from the extension of ``imagePath``.
        """
        from calibre.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        if self.opts:
            if self.opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
        else:
            SCREEN_X = 540
            SCREEN_Y = 700
        # Handle big image only
        if x > SCREEN_X or y > SCREEN_Y:
            # Use the larger ratio so both dimensions end up within bounds
            # while the aspect ratio is preserved.
            scale = max(float(x) / SCREEN_X, float(y) / SCREEN_Y)
            # TODO : intelligent image rotation
            #     img = img.rotate(90)
            #     x,y = y,x
            # Floor division by a float yields a float and can reach 0 for
            # very thin images; pass whole pixel counts of at least 1.
            new_width = max(1, int(x // scale))
            new_height = max(1, int(y // scale))
            img = resize_image(img, new_width, new_height)
        with lopen(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
Esempio n. 17
0
class TXTInput(InputFormatPlugin):

    # Identity of the plugin.
    name        = 'TXT Input'
    author      = 'John Schember'
    description = 'Convert TXT files to HTML'
    # convert() branches on these values through its file_ext argument:
    # 'txtz' is unpacked as a zip, 'md'/'markdown'/'textile' force the
    # formatting type.
    file_types  = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    # Human-readable labels for the selectable values. The keys of
    # 'formatting_types' and 'paragraph_types' are reused below as the
    # `choices` of the matching options and the labels are interpolated into
    # their help texts.
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    # Options specific to this plugin; convert() reads each of them from its
    # `options` argument under the same name.
    options = {
        # Which text processor turns the text into HTML.
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                       url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
        ),
        # How paragraphs are delimited in the input text.
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print:  {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
        ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        # Comma separated list; convert() splits it on ',' and strips each
        # entry before passing the list to the markdown converter. The help
        # text ends with one bullet per entry of MD_EXTENSIONS.
        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
                   'of the standard markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, base_dir, fname, data):
        """Write ``data`` to a new file in ``base_dir`` without clobbering anything.

        The file is called ``<stem>-<n><ext>``, where ``<stem>`` and ``<ext>``
        are the two halves of ``fname`` as split by ``os.path.splitext`` and
        ``n`` is the smallest integer >= 1 for which no such path exists yet.

        When the instance has a ``shifted_files`` list (``convert()`` creates
        one and deletes every path in it once conversion is over) the new path
        is appended to it. Without this the cleanup loop in ``convert()`` had
        nothing to remove and the generated copies were left behind next to
        the input file.

        :param base_dir: Directory to create the file in.
        :param fname: File name the new name is derived from.
        :param data: Bytes to write.
        :return: Path of the file that was written.
        """
        name, ext = os.path.splitext(fname)
        c = 1
        ans = os.path.join(base_dir, '{}-{}{}'.format(name, c, ext))
        while os.path.exists(ans):
            c += 1
            ans = os.path.join(base_dir, '{}-{}{}'.format(name, c, ext))
        with open(ans, 'wb') as f:
            # Register only once the file really exists, so that the cleanup
            # never tries to remove a path that could not be created.
            shifted = getattr(self, 'shifted_files', None)
            if shifted is not None:
                shifted.append(ans)
            f.write(data)
        return ans

    def fix_resources(self, html, base_dir):
        """Point relative ``<img>`` sources at fresh copies made by ``shift_file``.

        Every ``img`` whose ``src`` is neither absolute nor prefixed with one
        of ``file``/``http``/``https``/``ftp`` is resolved against
        ``base_dir``. If the resulting file is readable, its bytes are written
        to a new file through ``shift_file`` and ``src`` is replaced by the
        base name of that copy.

        :param html: Markup to inspect.
        :param base_dir: Directory relative sources are resolved against.
        :return: ``html`` itself when nothing was rewritten, otherwise the
            re-serialised document.
        """
        from html5_parser import parse
        root = parse(html)
        skipped_prefixes = ('file', 'http', 'https', 'ftp')
        rewritten = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            prefix = src.split(':', 1)[0].lower()
            if prefix in skipped_prefixes or os.path.isabs(src):
                continue
            path = os.path.join(base_dir, src)
            if not os.access(path, os.R_OK):
                continue
            with open(path, 'rb') as image_file:
                payload = image_file.read()
            copy_path = self.shift_file(base_dir, os.path.basename(path), payload)
            rewritten = True
            img.set('src', os.path.basename(copy_path))
        if not rewritten:
            return html
        from lxml import etree
        return etree.tostring(root, encoding='unicode')

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert a text-like input into an OEB book.

        The raw bytes are read from ``stream`` (for ``txtz``: from every
        ``.txt``/``.text`` file of the extracted archive), decoded, normalised
        and reshaped according to ``options.paragraph_type``. The text is then
        turned into HTML by the processor chosen through
        ``options.formatting_type`` and passed to the HTML input plugin, whose
        result is returned.

        :param stream: Open binary input stream.
        :param options: Conversion options. Several attributes are
            overwritten here: ``formatting_type``, ``paragraph_type``,
            ``input_encoding`` and every option declared by the HTML input
            plugin.
        :param file_ext: Extension of the input, e.g. ``'txt'`` or ``'md'``.
        :param log: Logger used for progress messages.
        :param accelerators: Not used by this method.
        :return: The OEB book created by the HTML input plugin.
        :raises ValueError: If the markdown converter raises ``RuntimeError``.
        """
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown.
        # The UTF-32 marks have to be tested before the UTF-16 ones: the
        # UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE),
        # so the opposite order strips only half of it and leaves the rest of
        # the data misaligned for decoding.
        import codecs
        for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8, codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            # Reuse the line length computed for 'unformatted' text, if any.
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Applied only when the user asked for runs of spaces to be kept.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        # Every path in shifted_files is deleted in the `finally` below.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file(base_dir, 'index.html', html.encode('utf-8'))
            # debug_pipeline is switched off for the nested conversion and
            # restored afterwards, also when that conversion fails.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # Generate oeb from html conversion. The handle is closed as
                # soon as the plugin returns instead of being left to the
                # garbage collector.
                with open(htmlfile, 'rb') as html_stream:
                    oeb = html_input.convert(html_stream, options, 'html', log, {})
            finally:
                options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        # Remembered for postprocess_book().
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Fill in placeholder titles with the title stored by ``convert()``.

        For every spine item whose data offers ``xpath``, each ``title``
        element whose text equals the translated string ``'Unknown'`` gets
        ``self.html_postprocess_title`` as its new text. ``opts`` and ``log``
        are not used.
        """
        for spine_item in oeb.spine:
            if not hasattr(spine_item.data, 'xpath'):
                continue
            for title_node in spine_item.data.xpath('//*[local-name()="title"]'):
                if title_node.text != _('Unknown'):
                    continue
                title_node.text = self.html_postprocess_title
Esempio n. 18
0
class FB2Output(OutputFormatPlugin):

    # Identity of the plugin.
    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'
    commit_name = 'fb2_output'

    # Genre identifiers offered as `choices` of the `fb2_genre` option and
    # listed in its help text; grouped by subject, with the English meaning
    # of each identifier given in the trailing comment.
    FB2_GENRES = [
        # Science Fiction & Fantasy
        'sf_history',  # Alternative history
        'sf_action',  # Action
        'sf_epic',  # Epic
        'sf_heroic',  # Heroic
        'sf_detective',  # Detective
        'sf_cyberpunk',  # Cyberpunk
        'sf_space',  # Space
        'sf_social',  # Social-philosophical
        'sf_horror',  # Horror & mystic
        'sf_humor',  # Humor
        'sf_fantasy',  # Fantasy
        'sf',  # Science Fiction
        # Detectives & Thrillers
        'det_classic',  # Classical detectives
        'det_police',  # Police Stories
        'det_action',  # Action
        'det_irony',  # Ironical detectives
        'det_history',  # Historical detectives
        'det_espionage',  # Espionage detectives
        'det_crime',  # Crime detectives
        'det_political',  # Political detectives
        'det_maniac',  # Maniacs
        'det_hard',  # Hard-boiled
        'thriller',  # Thrillers
        'detective',  # Detectives
        # Prose
        'prose_classic',  # Classics prose
        'prose_history',  # Historical prose
        'prose_contemporary',  # Contemporary prose
        'prose_counter',  # Counterculture
        'prose_rus_classic',  # Russian classics prose
        'prose_su_classics',  # Soviet classics prose
        # Romance
        'love_contemporary',  # Contemporary Romance
        'love_history',  # Historical Romance
        'love_detective',  # Detective Romance
        'love_short',  # Short Romance
        'love_erotica',  # Erotica
        # Adventure
        'adv_western',  # Western
        'adv_history',  # History
        'adv_indian',  # Indians
        'adv_maritime',  # Maritime Fiction
        'adv_geo',  # Travel & geography
        'adv_animal',  # Nature & animals
        'adventure',  # Other
        # Children's
        'child_tale',  # Fairy Tales
        'child_verse',  # Verses
        'child_prose',  # Prose
        'child_sf',  # Science Fiction
        'child_det',  # Detectives & Thrillers
        'child_adv',  # Adventures
        'child_education',  # Educational
        'children',  # Other
        # Poetry & Dramaturgy
        'poetry',  # Poetry
        'dramaturgy',  # Dramaturgy
        # Antique literature
        'antique_ant',  # Antique
        'antique_european',  # European
        'antique_russian',  # Old Russian
        'antique_east',  # Old east
        'antique_myths',  # Myths. Legends. Epos
        'antique',  # Other
        # Scientific-educational
        'sci_history',  # History
        'sci_psychology',  # Psychology
        'sci_culture',  # Cultural science
        'sci_religion',  # Religious studies
        'sci_philosophy',  # Philosophy
        'sci_politics',  # Politics
        'sci_business',  # Business literature
        'sci_juris',  # Jurisprudence
        'sci_linguistic',  # Linguistics
        'sci_medicine',  # Medicine
        'sci_phys',  # Physics
        'sci_math',  # Mathematics
        'sci_chem',  # Chemistry
        'sci_biology',  # Biology
        'sci_tech',  # Technical
        'science',  # Other
        # Computers & Internet
        'comp_www',  # Internet
        'comp_programming',  # Programming
        'comp_hard',  # Hardware
        'comp_soft',  # Software
        'comp_db',  # Databases
        'comp_osnet',  # OS & Networking
        'computers',  # Other
        # Reference
        'ref_encyc',  # Encyclopedias
        'ref_dict',  # Dictionaries
        'ref_ref',  # Reference
        'ref_guide',  # Guidebooks
        'reference',  # Other
        # Nonfiction
        'nonf_biography',  # Biography & Memoirs
        'nonf_publicism',  # Publicism
        'nonf_criticism',  # Criticism
        'design',  # Art & design
        'nonfiction',  # Other
        # Religion & Inspiration
        'religion_rel',  # Religion
        'religion_esoterics',  # Esoterics
        'religion_self',  # Self-improvement
        'religion',  # Other
        # Humor
        'humor_anecdote',  # Anecdote (funny stories)
        'humor_prose',  # Prose
        'humor_verse',  # Verses
        'humor',  # Other
        # Home & Family
        'home_cooking',  # Cooking
        'home_pets',  # Pets
        'home_crafts',  # Hobbies & Crafts
        'home_entertain',  # Entertaining
        'home_health',  # Health
        'home_garden',  # Garden
        'home_diy',  # Do it yourself
        'home_sport',  # Sports
        'home_sex',  # Erotica & sex
        'home',  # Other
    ]
    # Labels for the `sectionize` values plus the genre list. The keys of
    # 'sectionize' become the `choices` of the option below and its labels
    # are interpolated into the help text.
    ui_data = {
        'sectionize': {
            'toc': _('Section per entry in the ToC'),
            'files': _('Section per file'),
            'nothing': _('A single section')
        },
        'genres': FB2_GENRES,
    }

    # Options specific to this plugin.
    options = {
        OptionRecommendation(
            name='sectionize',
            recommended_value='files',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sectionize']),
            help=_(
                'Specify how sections are created:\n'
                ' * nothing: {nothing}\n'
                ' * files: {files}\n'
                ' * toc: {toc}\n'
                'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
                '(turn on "Force use of auto-generated Table of Contents").').
            format(**ui_data['sectionize'])),
        # The help text is assembled from a translated part listing every
        # genre, the untranslated reference URL and a translated tail.
        OptionRecommendation(
            name='fb2_genre',
            recommended_value='antique',
            level=OptionRecommendation.LOW,
            choices=FB2_GENRES,
            help=(_('Genre for the book. Choices: %s\n\n See: ') %
                  ', '.join(FB2_GENRES)) +
            'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres '
            + _('for a complete list with descriptions.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Serialise ``oeb_book`` as FB2 and write it to ``output_path``.

        SVG content is rasterised first when the rasteriser is available and
        the jacket is linearised. The markup produced by ``FB2MLizer`` is then
        written UTF-8 encoded (unencodable characters are replaced).

        :param oeb_book: The book to convert.
        :param output_path: Either a path, in which case missing parent
            directories are created and the file is opened and closed here,
            or an object with a ``write`` method, which is rewound, truncated,
            written to and left open.
        :param input_plugin: Not used by this method.
        :param opts: Conversion options, passed on to the transforms.
        :param log: Logger.
        """
        from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.fb2.fb2ml import FB2MLizer

        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        linearize_jacket(oeb_book)

        fb2mlizer = FB2MLizer(log)
        fb2_content = fb2mlizer.extract_content(oeb_book, opts)

        # Only a stream opened here is ours to close.
        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(fb2_content.encode('utf-8', 'replace'))
        finally:
            # Release the handle even if seeking or writing failed.
            if close:
                out_stream.close()
Esempio n. 19
0
class MOBIOutput(OutputFormatPlugin):

    # Identity of the plugin.
    name = 'MOBI Output'
    author = 'Kovid Goyal'
    file_type = 'mobi'
    commit_name = 'mobi_output'
    # Allowed values of the `mobi_file_type` option declared below.
    ui_data = {'file_types': ['old', 'both', 'new']}

    # Options specific to this plugin. Within this class, convert() reads
    # `mobi_file_type` and write_mobi() reads `no_inline_toc`, `toc_title`
    # and `mobi_toc_at_start`; the other options are not referenced by the
    # methods of this class.
    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.')),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                   'the book has its own table of contents.')),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.')),
        OptionRecommendation(
            name='personal_doc',
            recommended_value='[PDOC]',
            help=_('Tag for MOBI files to be marked as personal documents.'
                   ' This option has no effect on the conversion. It is used'
                   ' only when sending MOBI files to a device. If the file'
                   ' being sent has the specified tag, it will be marked as'
                   ' a personal document when sent to the Kindle.')),
        OptionRecommendation(
            name='mobi_ignore_margins',
            recommended_value=False,
            help=_(
                'Ignore margins in the input document. If False, then '
                'the MOBI output plugin will try to convert margins specified'
                ' in the input document, otherwise it will ignore them.')),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=_(
                'When adding the Table of Contents to the book, add it at the start of the '
                'book instead of the end. Not recommended.')),
        # The format name is substituted into the translated template so the
        # same msgid can serve more than one format.
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified folder. The contents of the folder are first '
                   'deleted, so be careful.') % 'MOBI'),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                   ' on the Kindle. WARNING: Using this feature means that '
                   ' the book will not auto sync its last read position '
                   ' on multiple devices. Complain to Amazon.')),
        OptionRecommendation(
            name='mobi_keep_original_images',
            recommended_value=False,
            help=_(
                'By default calibre converts all images to JPEG format '
                'in the output MOBI file. This is for maximum compatibility '
                'as some older MOBI viewers have problems with other image '
                'formats. This option tells calibre not to do this. '
                'Useful if your document contains lots of GIF/PNG images that '
                'become very large when converted to JPEG.')),
        # 'old' -> MOBI 6 only, 'both' -> joint file, 'new' -> KF8 only
        # (see convert()).
        OptionRecommendation(
            name='mobi_file_type',
            choices=ui_data['file_types'],
            recommended_value='old',
            help=_(
                'By default calibre generates MOBI files that contain the '
                'old MOBI 6 format. This format is compatible with all '
                'devices. However, by changing this setting, you can tell '
                'calibre to generate MOBI files that contain both MOBI 6 and '
                'the new KF8 format, or only the new KF8 format. KF8 has '
                'more features than MOBI 6, but only works with newer Kindles. '
                'Allowed values: {}').format('old, both, new')),
    }

    def check_for_periodical(self):
        """Prepare periodical-specific data and flag the options accordingly.

        For a periodical the TOC is restructured and a masthead is ensured
        before ``self.opts.mobi_periodical`` is set to ``True``; otherwise the
        flag is simply set to ``False``.
        """
        if not self.is_periodical:
            self.opts.mobi_periodical = False
            return
        self.periodicalize_toc()
        self.check_for_masthead()
        self.opts.mobi_periodical = True

    def check_for_masthead(self):
        """Make sure the book's guide has a ``masthead`` entry.

        If the guide already contains one, that fact is only logged.
        Otherwise an image is generated from the first title in the metadata,
        added to the manifest as ``image/gif`` and registered in the guide.
        """
        book = self.oeb
        if 'masthead' in book.guide:
            book.log.debug('Using mastheadImage supplied in manifest...')
            return

        from calibre.ebooks import generate_masthead
        book.log.debug(
            'No masthead found in manifest, generating default mastheadImage...'
        )
        title = unicode_type(book.metadata['title'][0])
        image_data = generate_masthead(title)
        item_id, href = book.manifest.generate('masthead', 'masthead')
        book.manifest.add(item_id, href, 'image/gif', data=image_data)
        book.guide.add('masthead', 'Masthead Image', href)

    def periodicalize_toc(self):
        """Rebuild the TOC as periodical -> sections -> articles, in place.

        Nothing happens when the TOC is empty, when the spine has fewer than
        three items, or when the first TOC node already has the class
        ``'periodical'``. Otherwise one or two leading spine items are removed
        from the manifest and all existing TOC nodes are replaced by a single
        root node of class ``'periodical'``.
        """
        from calibre.ebooks.oeb.base import TOC
        toc = self.oeb.toc
        if not toc or len(self.oeb.spine) < 3:
            return
        if toc and toc[0].klass != 'periodical':
            # Captured up front: the manifest is modified below.
            one, two = self.oeb.spine[0], self.oeb.spine[1]
            self.log('Converting TOC for MOBI periodical indexing...')

            # Maps id(section) -> list of that section's article nodes.
            articles = {}
            if toc.depth() < 3:
                # single section periodical
                self.oeb.manifest.remove(one)
                self.oeb.manifest.remove(two)
                # NOTE(review): spine[0] is read again after the removals, so
                # this presumably refers to what used to be the third spine
                # item -- confirm that manifest.remove also updates the spine.
                sections = [
                    TOC(klass='section',
                        title=_('All articles'),
                        href=self.oeb.spine[0].href)
                ]
                # Every existing top-level node goes under the one section.
                for x in toc:
                    sections[0].nodes.append(x)
            else:
                # multi-section periodical
                self.oeb.manifest.remove(one)
                sections = list(toc)
                for i, x in enumerate(sections):
                    x.klass = 'section'
                    articles_ = list(x)
                    if articles_:
                        # Drop the manifest item the section pointed to and
                        # point the section at its first article instead.
                        self.oeb.manifest.remove(
                            self.oeb.manifest.hrefs[x.href])
                        x.href = articles_[0].href

            # Detach the children of every section, marking them as articles.
            for sec in sections:
                articles[id(sec)] = []
                for a in list(sec):
                    a.klass = 'article'
                    articles[id(sec)].append(a)
                    sec.nodes.remove(a)

            root = TOC(klass='periodical',
                       href=self.oeb.spine[0].href,
                       title=unicode_type(self.oeb.metadata.title[0]))

            # Re-attach the articles; sections without any article are not
            # added to the new root.
            for s in sections:
                if articles[id(s)]:
                    for a in articles[id(s)]:
                        s.nodes.append(a)
                    root.nodes.append(s)

            # Replace the old top-level nodes by the new root.
            for x in list(toc.nodes):
                toc.nodes.remove(x)

            toc.nodes.append(root)

            # Fix up the periodical href to point to first section href
            # NOTE(review): if no section had any article, root.nodes is empty
            # and this lookup would presumably fail -- confirm whether that
            # can happen.
            toc.nodes[0].href = toc.nodes[0].nodes[0].href

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` as MOBI 6, KF8 or a joint file.

        ``opts.mobi_file_type`` selects the flavour (``'old'``, ``'both'`` or
        ``'new'``); periodicals are always written as ``'old'``. For
        ``'new'`` only the KF8 book is written; in every other case
        ``write_mobi`` produces the file, embedding the KF8 book when one was
        created.
        """
        from calibre.ebooks.mobi.writer2.resources import Resources
        self.log = log
        self.opts = opts
        self.oeb = oeb

        requested_type = opts.mobi_file_type
        # Amazon does not support KF8 periodicals
        mobi_type = 'old' if self.is_periodical else requested_type
        wants_kf8 = mobi_type in ('new', 'both')

        remove_html_cover(self.oeb, self.log)
        resources = Resources(oeb,
                              opts,
                              self.is_periodical,
                              add_fonts=wants_kf8)
        self.check_for_periodical()

        kf8 = None
        if wants_kf8:
            from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
            remove_duplicate_anchors(self.oeb)
            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            Split()(self.oeb, self.opts)
            kf8 = self.create_kf8(resources, for_joint=(mobi_type == 'both'))

        if mobi_type != 'new':
            self.log('Creating MOBI 6 output')
            self.write_mobi(input_plugin, output_path, kf8, resources)
            return

        # KF8-only output: the KF8 book is the whole file.
        kf8.write(output_path)
        extract_mobi(output_path, opts)

    def create_kf8(self, resources, for_joint=False):
        """Build the KF8 book for ``self.oeb`` using ``self.opts``.

        ``resources`` and ``for_joint`` are passed through unchanged to
        ``create_kf8_book``, whose result is returned.
        """
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        book = create_kf8_book(
            self.oeb, self.opts, resources, for_joint=for_joint)
        return book

    def write_mobi(self, input_plugin, output_path, kf8, resources):
        """Run the MOBI 6 transforms on ``self.oeb`` and write the file.

        In order: optional inline TOC, case mangling, SVG rasterisation (when
        available), the metadata jacket workaround, MobiML conversion and
        finally the writer. ``kf8`` is handed to the writer as is.
        """
        from calibre.ebooks.mobi.mobiml import MobiMLizer
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.customize.ui import plugin_for_input_format

        opts = self.opts
        oeb = self.oeb

        if not opts.no_inline_toc:
            toc_title = opts.toc_title
            toc_position = 'start' if opts.mobi_toc_at_start else 'end'
            HTMLTOCAdder(title=toc_title, position=toc_position)(oeb, opts)

        CaseMangler()(oeb, opts)

        try:
            SVGRasterizer()(oeb, opts)
        except Unavailable:
            self.log.warn(
                'SVG rasterizer unavailable, SVG will not be converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()

        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            jacket = self.oeb.inserted_metadata_jacket
            self.workaround_fire_bugs(jacket)

        MobiMLizer(ignore_tables=opts.linearize_tables)(oeb, opts)

        # Page breaks after items are written unless the input came from the
        # plugin registered for 'cbz'.
        cbz_plugin = plugin_for_input_format('cbz')
        breaks_after_item = input_plugin is not cbz_plugin

        from calibre.ebooks.mobi.writer2.main import MobiWriter
        mobi_writer = MobiWriter(
            opts,
            resources,
            kf8,
            write_page_breaks_after_item=breaks_after_item)
        mobi_writer(oeb, output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Apply ``CSSCleanup`` (built from ``log`` and ``opts``) to ``item``."""
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)

    def workaround_fire_bugs(self, jacket):
        """Replace the jacket's table layout by plain ``div``/``span`` elements.

        Every ``table`` and every ``tr`` below it is retagged as an XHTML
        ``div``, every ``td`` as an XHTML ``span``. The elements stay where
        they are; only their tags change.

        :param jacket: Item whose ``data`` tree holds the jacket markup.
        """
        # The idiotic Fire crashes when trying to render the table used to
        # layout the jacket
        from calibre.ebooks.oeb.base import XHTML
        for table in jacket.data.xpath('//*[local-name()="table"]'):
            table.tag = XHTML('div')
            for tr in table.xpath('descendant::*[local-name()="tr"]'):
                cols = tr.xpath('descendant::*[local-name()="td"]')
                tr.tag = XHTML('div')
                for td in cols:
                    # This used to read `'span' if cols else 'div'`, but the
                    # loop body only runs when `cols` is non-empty, so the
                    # 'div' alternative could never be chosen.
                    td.tag = XHTML('span')
Esempio n. 20
0
class AZW3Output(OutputFormatPlugin):

    # Identity of the plugin and the file type it produces.
    name = 'AZW3 Output'
    author = 'Kovid Goyal'
    file_type = 'azw3'
    commit_name = 'azw3_output'

    # Conversion options declared by this plugin. Each entry carries its
    # option name, its recommended (default) value and the translated help
    # text shown to the user.
    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.')),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                   'the book has its own table of contents.')),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.')),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=_(
                'When adding the Table of Contents to the book, add it at the start of the '
                'book instead of the end. Not recommended.')),
        # No recommended_value is given here; the option only has help text.
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified folder. The contents of the folder are first '
                   'deleted, so be careful.') % 'AZW3'),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                   ' on the Kindle. WARNING: Using this feature means that '
                   ' the book will not auto sync its last read position '
                   ' on multiple devices. Complain to Amazon.')),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''Write ``oeb`` to ``output_path`` as a standalone KF8 book.

        Duplicate anchors are always removed. Unless ``opts`` carries a true
        ``mobi_passthrough`` attribute, the HTML cover is removed and the
        book is split on page breaks before the KF8 container is built.
        Finally ``extract_mobi`` is called on the written file.
        '''
        from calibre.ebooks.mobi.writer2.resources import Resources
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors

        self.oeb = oeb
        self.opts = opts
        self.log = log
        opts.mobi_periodical = self.is_periodical
        skip_preprocessing = getattr(opts, 'mobi_passthrough', False)
        remove_duplicate_anchors(oeb)

        book_resources = Resources(
            self.oeb, self.opts, self.is_periodical,
            add_fonts=True, process_images=False)

        if not skip_preprocessing:
            remove_html_cover(self.oeb, self.log)

            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)

        book = create_kf8_book(
            self.oeb, self.opts, book_resources, for_joint=False)
        book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        '''Apply the KF8 writer's CSSCleanup to ``item`` with ``stylizer``.'''
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        run_cleanup = CSSCleanup(log, opts)
        run_cleanup(item, stylizer)
Esempio n. 21
0
class KEPUBInput(EPUBInput):
    """Extension of calibre's EPUBInput to understand KePub format books."""

    # Identity of the plugin and the file extension it accepts.
    name = "KePub Input"
    description = "Convert KEPUB files (.kepub) to HTML"
    author = "David Forrester"
    file_types = {"kepub"}
    version = plugin_version
    minimum_calibre_version = (0, 1, 0)

    # The single option declared by this plugin; postprocess_book() reads it
    # to decide whether the Kobo sentence spans are removed.
    options = {
        OptionRecommendation(
            name="strip_kobo_spans",
            recommended_value=True,
            help=_(  # noqa: F821
                "Kepubs have spans wrapping each sentence. These are used by "
                "the ereader for the reading location and bookmark location. "
                "They are not used by an ePub reader but are valid code and "
                "can be safely be left in the ePub. If you plan to edit the "
                "ePub, it is recommended that you remove the spans."),
        )
    }

    # This plugin declares an empty set of recommendations.
    recommendations = set([])

    def gui_configuration_widget(
            self, parent, get_option_by_name, get_option_help, db,
            book_id=None):
        """Create and return the configuration widget for this plugin."""
        from calibre_plugins.kepubin.conversion.input_config import PluginWidget

        widget_args = (parent, get_option_by_name, get_option_help, db,
                       book_id)
        return PluginWidget(*widget_args)

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert a KePub file into a structure calibre can process.

        The archive in ``stream`` is unpacked into the current working
        directory, its OPF is located and cleaned up, and the result is
        written to ``content.opf`` in that directory.

        Returns the absolute path of the written ``content.opf``.

        Raises ``ValueError`` when no OPF can be found, when the manifest
        declares DTBook markup, or when no spine entry survives the clean-up,
        and ``DRMError`` when a ``rights.xml`` file is present.
        """
        log("KEPUBInput::convert - start")
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF

        # The working directory is both the extraction target and the base
        # for the relative OPF path below, so compute it once.
        cwd = os.getcwdu() if sys.version_info.major == 2 else os.getcwd()

        try:
            zf = ZipFile(stream)
            zf.extractall(cwd)
        except Exception:
            log.exception("KEPUB appears to be invalid ZIP file, trying a "
                          "more forgiving ZIP parser")
            from calibre.utils.localunzip import extractall

            stream.seek(0)
            extractall(stream)
        opf = self.find_opf()
        if opf is None:
            # Fall back to the first .opf on disk that is neither a macOS
            # resource-fork leftover nor a hidden file.
            for f in walk("."):
                if (f.lower().endswith(".opf") and "__MACOSX" not in f
                        and not os.path.basename(f).startswith(".")):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, "name", "stream")

        if opf is None:
            raise ValueError(
                _(  # noqa: F821
                    "{0} is not a valid KEPUB file (could not find opf)").
                format(path))

        encfile = os.path.abspath("rights.xml")
        if os.path.exists(encfile):
            raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, cwd)
        opf_dir = os.path.dirname(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self.encrypted_fonts = []

        if opf_dir:
            # The OPF is not at the top level, so every href must be prefixed
            # with the OPF's directory. hrefs are URLs, so the prefix always
            # uses forward slashes, even where os.sep is a backslash.
            delta = opf_dir.replace(os.sep, "/") + "/"
            for iter_elems in (opf.itermanifest, opf.iterguide):
                for elem in iter_elems():
                    href = elem.get("href")
                    # An element without an href has nothing to re-base
                    if href is not None:
                        elem.set("href", delta + href)

        f = (self.rationalize_cover3
             if opf.package_version >= 3.0 else self.rationalize_cover2)
        self.removed_cover = f(opf, log)

        self.optimize_opf_parsing = opf

        # Manifest ids whose media type marks them as not being book content
        not_for_spine = set()
        skipped_media_types = {
            "application/vnd.adobe-page-template+xml",
            "application/vnd.adobe.page-template+xml",
            "application/adobe-page-template+xml",
            "application/adobe.page-template+xml",
            "application/text",
        }
        for x in opf.itermanifest():
            media_type = x.get("media-type", None)
            if media_type == "application/x-dtbook+xml":
                raise ValueError(
                    _("EPUB files with DTBook markup are not supported"
                      )  # noqa: F821
                )
            id_ = x.get("id", None)
            if id_ and media_type in skipped_media_types:
                not_for_spine.add(id_)

        # Drop spine entries that have no idref, point at a skipped manifest
        # item, or repeat an idref that was already seen.
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get("idref", None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if not list(opf.iterspine()):
            raise ValueError(
                _("No valid entries in the spine of this EPUB")  # noqa: F821
            )

        with open("content.opf", "wb") as nopf:
            nopf.write(opf.render())

        return os.path.abspath("content.opf")

    def postprocess_book(self, oeb, opts, log):
        """Perform any needed post-input processing on the book.

        Runs the parent class's post-processing and then, unless the
        ``strip_kobo_spans`` option is off, unwraps every
        ``<span class="koboSpan">`` found in the spine items so that the
        span's content becomes part of the enclosing element.
        """
        log("KEPUBInput::postprocess_book - start")
        from calibre.ebooks.oeb.base import XHTML_NS

        def append_text(parent, anchor_idx, text):
            # Text that follows child ``anchor_idx`` lives in that child's
            # tail; text before the first child lives in the parent's text.
            if anchor_idx < 0:
                parent.text = (parent.text or "") + text
            else:
                anchor = parent[anchor_idx]
                anchor.tail = (anchor.tail or "") + text

        # The Kobo spans wrap each sentence. Remove them and splice their
        # content (text, child elements and tail) into the parent tag.
        def refactor_span(a):
            p = a.getparent()
            if p is None:
                # Already detached from the tree; nothing to unwrap
                return
            pos = p.index(a)
            children = list(a)
            # Removing the span takes its tail with it; the tail is
            # re-attached below.
            p.remove(a)

            append_text(p, pos - 1, a.text or "")
            # Keep any elements the span wrapped instead of discarding them
            # together with the span.
            for offset, child in enumerate(children):
                p.insert(pos + offset, child)
            append_text(p, pos + len(children) - 1, a.tail or "")

        super(KEPUBInput, self).postprocess_book(oeb, opts, log)

        if not opts.strip_kobo_spans:
            log("KEPUBInput::postprocess_book - not stripping kobo spans")
            return

        for item in oeb.spine:
            log("item.__class__.__name__", item.__class__.__name__)
            # Only parsed documents offer xpath(); skip anything else
            if not hasattr(item.data, "xpath"):
                continue

            for a in item.data.xpath('//h:span[@class="koboSpan"]',
                                     namespaces={"h": XHTML_NS}):
                refactor_span(a)

        log("KEPUBInput::postprocess_book - end")

    # Shouldn't get called, but overriding just in case.
    def process_encryption(self, encfile, opf, log):
        """Return True when no ``rights.xml`` exists in the working directory.

        The ``encfile`` argument is ignored; the check is always made against
        ``rights.xml``.
        """
        rights_path = os.path.abspath("rights.xml")
        if os.path.exists(rights_path):
            return False
        return True
Esempio n. 22
0
class PDFOutput(OutputFormatPlugin):

    # Identity of the plugin and the file type it produces.
    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'

    # Conversion options declared by this plugin. Each entry carries its
    # option name, its recommended (default) value and the translated help
    # text shown to the user.
    options = set([
        OptionRecommendation(
            name='override_profile_size',
            recommended_value=False,
            help=_('Normally, the PDF page size is set by the output profile'
                   ' chosen under page options. This option will cause the '
                   ' page size settings under PDF Output to override the '
                   ' size specified by the output profile.')),
        OptionRecommendation(
            name='unit',
            recommended_value='inch',
            level=OptionRecommendation.LOW,
            short_switch='u',
            choices=UNITS,
            help=_(
                'The unit of measure for page sizes. Default is inch. Choices '
                'are %s '
                'Note: This does not override the unit for margins!') % UNITS),
        OptionRecommendation(
            name='paper_size',
            recommended_value='letter',
            level=OptionRecommendation.LOW,
            choices=PAPER_SIZES,
            help=
            _('The size of the paper. This size will be overridden when a '
              'non default output profile is used. Default is letter. Choices '
              'are %s') % PAPER_SIZES),
        OptionRecommendation(
            name='custom_size',
            recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(
            name='pdf_serif_family',
            recommended_value='Liberation Serif'
            if islinux else 'Times New Roman',
            help=_('The font family used to render serif fonts')),
        OptionRecommendation(
            name='pdf_sans_family',
            recommended_value='Liberation Sans' if islinux else 'Helvetica',
            help=_('The font family used to render sans-serif fonts')),
        OptionRecommendation(
            name='pdf_mono_family',
            recommended_value='Liberation Mono' if islinux else 'Courier New',
            help=_('The font family used to render monospaced fonts')),
        # This option picks one of the three font types above, so its help
        # must not repeat the monospace-family text.
        OptionRecommendation(
            name='pdf_standard_font',
            choices=['serif', 'sans', 'mono'],
            recommended_value='serif',
            help=_('The font type (serif, sans or mono) used as the standard'
                   ' font')),
        OptionRecommendation(name='pdf_default_font_size',
                             recommended_value=20,
                             help=_('The default font size')),
        OptionRecommendation(
            name='pdf_mono_font_size',
            recommended_value=16,
            help=_('The default font size for monospaced text')),
        OptionRecommendation(
            name='pdf_mark_links',
            recommended_value=False,
            help=_(
                'Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(
            name='old_pdf_engine',
            recommended_value=False,
            help=_('Use the old, less capable engine to generate the PDF')),
        OptionRecommendation(
            name='uncompressed_pdf',
            recommended_value=False,
            help=_('Generate an uncompressed PDF, useful for debugging, '
                   'only works with the new PDF engine.')),
        OptionRecommendation(
            name='pdf_page_numbers',
            recommended_value=False,
            help=_(
                'Add page numbers to the bottom of every page in the generated PDF file. If you '
                'specify a footer template, it will take precedence '
                'over this option.')),
        OptionRecommendation(
            name='pdf_footer_template',
            recommended_value=None,
            help=
            _('An HTML template used to generate %s on every page.'
              ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
              ) % _('footers')),
        OptionRecommendation(
            name='pdf_header_template',
            recommended_value=None,
            help=
            _('An HTML template used to generate %s on every page.'
              ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.'
              ) % _('headers')),
        OptionRecommendation(
            name='pdf_add_toc',
            recommended_value=False,
            help=
            _('Add a Table of Contents at the end of the PDF that lists page numbers. '
              'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.'
              )),
        OptionRecommendation(name='toc_title',
                             recommended_value=None,
                             help=_('Title for generated table of contents.')),
    ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''Render ``oeb_book`` to a PDF at ``output_path``.

        The book's metadata is serialized into a throwaway OPF 2.0 package
        and parsed back to obtain ``self.metadata``. The input is then
        handled either as an image collection or as a text based book,
        depending on ``input_plugin.is_image_collection``.
        '''
        from calibre.gui2 import must_use_qt, load_builtin_fonts
        must_use_qt()
        load_builtin_fonts()

        self.oeb = oeb_book
        self.input_plugin = input_plugin
        self.opts = opts
        self.log = log
        self.output_path = output_path

        # Two unrelated objects are both called OPF upstream: the tag-name
        # helper from oeb.base and the metadata parser from opf2. Alias them
        # so neither shadows the other.
        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from lxml import etree
        from io import BytesIO
        from calibre.ebooks.metadata.opf2 import OPF as OPFParser

        package_attrs = {'version': '2.0', 'unique-identifier': 'dummy'}
        package = etree.Element(
            opf_tag('package'), attrib=package_attrs, nsmap={None: OPF2_NS})
        self.oeb.metadata.to_opf2(package)
        serialized = etree.tostring(package)
        self.metadata = OPFParser(BytesIO(serialized)).to_book_metadata()
        self.cover_data = None

        if not input_plugin.is_image_collection:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)
        else:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())

    def convert_images(self, images):
        '''Write ``images`` to the output using ImagePDFWriter, with no ToC.'''
        from calibre.ebooks.pdf.writer import ImagePDFWriter
        no_toc = None
        self.write(ImagePDFWriter, images, no_toc)

    def get_cover_data(self):
        '''Store the cover item's data in ``self.cover_data`` when available.

        Nothing is changed when the metadata names no cover or the named
        cover id is not in the manifest.
        '''
        book = self.oeb
        if not book.metadata.cover:
            return
        cover_id = unicode(book.metadata.cover[0])
        if cover_id not in book.manifest.ids:
            return
        self.cover_data = book.manifest.ids[cover_id].data

    def handle_embedded_fonts(self):
        '''
        Because of QtWebKit's inability to handle embedded fonts correctly, we
        remove the embedded fonts and make them available system wide instead.
        If you ever move to Qt WebKit 2.3+ then this will be unnecessary.

        Works in two passes over the manifest's stylesheets: the first drops
        every @font-face rule and registers the referenced font with Qt, the
        second rewrites font-family values in style rules to the family names
        Qt reported for those fonts.
        '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction
        from PyQt4.Qt import QFontDatabase, QByteArray, QRawFont, QFont

        # First find all @font-face rules and remove them, adding the embedded
        # fonts to Qt
        family_map = {}
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            remove = set()
            for i, rule in enumerate(item.data.cssRules):
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                # The rule is dropped even if its font cannot be registered
                remove.add(i)
                try:
                    s = rule.style
                    src = s.getProperty('src').propertyValue[0].uri
                    font_family = s.getProperty(
                        'font-family').propertyValue[0].value
                except Exception:
                    # Malformed @font-face rule: nothing usable to register
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue

                raw = ff.data
                self.oeb.manifest.remove(ff)
                try:
                    raw = remove_embed_restriction(raw)
                except Exception:
                    # Best effort: skip fonts whose data cannot be processed
                    continue
                fid = QFontDatabase.addApplicationFontFromData(
                    QByteArray(raw))
                family_name = None
                if fid > -1:
                    try:
                        family_name = unicode(
                            QFontDatabase.applicationFontFamilies(fid)[0])
                    except (IndexError, KeyError):
                        pass
                if family_name:
                    family_map[icu_lower(font_family)] = family_name

            # Pop from the end so earlier indices stay valid
            for i in sorted(remove, reverse=True):
                item.data.cssRules.pop(i)

        # Now map the font family name specified in the css to the actual
        # family name of the embedded font (they may be different in general).
        font_warnings = set()
        for item in self.oeb.manifest:
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.STYLE_RULE:
                    continue
                ff = rule.style.getProperty('font-family')
                if ff is None:
                    continue
                val = ff.propertyValue
                for idx in xrange(val.length):
                    try:
                        k = icu_lower(val[idx].value)
                    except (AttributeError, TypeError):
                        val[idx].value = k = 'times'
                    if k in family_map:
                        val[idx].value = family_map[k]
                if iswindows:
                    # On windows, Qt uses GDI which does not support OpenType
                    # (CFF) fonts, so we need to nuke references to OpenType
                    # fonts. Note that you could compile QT with configure
                    # -directwrite, but that requires atleast Vista SP2
                    for idx in xrange(val.length):
                        family = val[idx].value
                        if family:
                            f = QRawFont.fromFont(QFont(family))
                            if len(f.fontTable('head')) == 0:
                                if family not in font_warnings:
                                    self.log.warn(
                                        'Ignoring unsupported font: %s' %
                                        family)
                                    font_warnings.add(family)
                                # Either a bitmap or (more likely) a CFF font
                                val[idx].value = 'times'

    def convert_text(self, oeb_book):
        '''Render a text based book to PDF.

        The book is first written to a temporary directory by the 'oeb'
        output plugin; the spine paths of the resulting OPF are then handed
        to the PDF writer selected by the ``old_pdf_engine`` option.
        '''
        from calibre.ebooks.metadata.opf2 import OPF
        if self.opts.old_pdf_engine:
            from calibre.ebooks.pdf.writer import PDFWriter as writer_class
        else:
            from calibre.ebooks.pdf.render.from_html import (
                PDFWriter as writer_class)

        self.log.debug('Serializing oeb input to disk for processing...')
        self.get_cover_data()

        self.handle_embedded_fonts()

        with TemporaryDirectory('_pdf_out') as workdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_plugin = plugin_for_output_format('oeb')
            oeb_plugin.convert(
                oeb_book, workdir, self.input_plugin, self.opts, self.log)

            # Use the first OPF found in the serialized book
            opf_file = glob.glob(os.path.join(workdir, '*.opf'))[0]
            parsed = OPF(opf_file, os.path.dirname(opf_file))

            spine_paths = [entry.path for entry in parsed.spine]
            toc = getattr(parsed, 'toc', None)
            self.write(writer_class, spine_paths, toc)

    def write(self, Writer, items, toc):
        '''Render ``items`` to the PDF output with a ``Writer`` instance.

        ``self.output_path`` may be a filesystem path or an object with a
        ``write`` attribute. A path is opened here (creating its parent
        directory when needed) and is always closed again, even when
        rendering fails; a stream supplied by the caller is left open.

        :param Writer: writer class, called as
            ``Writer(opts, log, cover_data=..., toc=...)``
        :param items: passed through to ``writer.dump``
        :param toc: table of contents handed to the writer, may be None
        '''
        import time

        writer = Writer(self.opts,
                        self.log,
                        cover_data=self.cover_data,
                        toc=toc)
        writer.report_progress = self.report_progress

        owns_stream = not hasattr(self.output_path, 'write')
        if owns_stream:
            out_dir = os.path.dirname(self.output_path)
            # An empty dirname means the current directory: nothing to create
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(self.output_path, 'wb')
        else:
            out_stream = self.output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            self.log.debug('Rendering pages to PDF...')
            st = time.time()
            writer.dump(items, out_stream, PDFMetadata(self.metadata))
            self.log('Rendered PDF in %g seconds:' % (time.time() - st))
        finally:
            # Only close what was opened here
            if owns_stream:
                out_stream.close()

    def specialize_css_for_output(self, log, opts, item, stylizer):
        ''' Qt WebKit (4.8.x) cannot handle font-variant: small-caps. It tries to fake the small caps,
        which is ok, but the faking continues on to subsequent text that should not be in small-caps.
        So we workaround the problem by faking small caps ourselves. A minimal example that Qt chokes on:
        <html><body>
        <p style="font-variant:small-caps">Some Small-caps Text</p>
        <p style="text-align:justify">Some non small-caps text with enough text for at least one
        full line and justification enabled. Both of these are needed for the example to work.</p>
        </body></html>

        Only the leading text of each small-caps element is rewritten: it is
        split into runs, lowercase runs are upper-cased and given a smaller
        font size, and each run is wrapped in a span. The font-variant
        property is dropped from every element visited. Items whose data has
        no ``xpath`` attribute are left untouched. '''
        from calibre.ebooks.oeb.base import XHTML
        import itertools
        import string
        if not hasattr(item.data, 'xpath'):
            return
        ws = unicode(string.whitespace)

        def fake_small_caps(elem):
            spans = []
            # Group consecutive characters by "is a lowercase,
            # non-whitespace character"
            for lowercase, textiter in itertools.groupby(
                    elem.text, lambda x: x not in ws and icu_lower(x) == x):
                text = ''.join(textiter)
                if lowercase:
                    text = icu_upper(text)
                span = elem.makeelement(XHTML('span'))
                span.text = text
                style = stylizer.style(span)
                if lowercase:
                    style.set('font-size', '0.65em')
                spans.append(span)
            elem.text = None
            # Insert the spans in front of the existing children. Assigning
            # to elem[0:] would replace those children and drop them from
            # the document.
            elem[0:0] = spans

        def process_elem(elem, parent_fv=None):
            # Snapshot the children first so the spans added by
            # fake_small_caps are not themselves processed
            children = tuple(elem)
            style = stylizer.style(elem)
            fv = style.drop('font-variant')
            if not fv or fv.lower() == 'inherit':
                fv = parent_fv
            if fv and fv.lower() in {'smallcaps', 'small-caps'}:
                if elem.text:
                    fake_small_caps(elem)
            for child in children:
                # Skip nodes whose tag is not a string
                if hasattr(getattr(child, 'tag', None), 'lower'):
                    process_elem(child, parent_fv=fv)

        for body in item.data.xpath('//*[local-name()="body"]'):
            process_elem(body)
Esempio n. 23
0
class PDFOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the converted book as a PDF file.

    Image collections are handed to calibre.ebooks.pdf.image_writer, all
    other books are first written out as an OEB directory and then rendered
    by calibre.ebooks.pdf.html_writer.
    '''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    # Choice lists exposed to the configuration UI; 'font_types' is also used
    # below as the set of choices for the pdf_standard_font option.
    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}

    options = {
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
            'are {} '
            'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
            'non default output profile is used. Default is letter. Choices '
            'are {}').format(', '.join(PAPER_SIZES))),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
            'e.g. `123x321` to specify the width and height. '
            'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                ' of stretching it to fill the full first page of the'
                ' generated PDF.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Times', help=_(
                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Helvetica', help=_(
                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Courier', help=_(
                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
        # NOTE(review): this help text repeats the wording used for
        # pdf_mono_family although the option picks one of
        # ui_data['font_types']; looks like a copy/paste slip -- confirm
        # before touching the translatable string.
        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
            recommended_value='serif', help=_(
                'The font family used to render monospace fonts')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),

        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
            ' This will cause the margins specified in the conversion settings to be ignored.'
            ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
                ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
        ),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging.')
        ),
        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help=_(
                'Shift the text horizontally by the specified offset (in pts).'
                ' On odd numbered pages, it is shifted to the right and on even'
                ' numbered pages to the left. Use negative numbers for the opposite'
                ' effect. Note that this setting is ignored on pages where the margins'
                ' are smaller than the specified offset. Shifting is done by setting'
                ' the PDF CropBox, not all software respects the CropBox.'
            )
        ),

    }

    def specialize_options(self, log, opts, input_fmt):
        '''
        Prepare Qt WebEngine for PDF rendering and remember the input format.

        Registers the FAKE_PROTOCOL URL scheme, calls must_use_qt() and, when
        pdf_use_document_margins is set, sets the four common margin options
        to -1.
        '''
        # Ensure Qt is setup to be used with WebEngine
        # specialize_options is called early enough in the pipeline
        # that hopefully no Qt application has been constructed as yet
        from qt.webengine import QWebEngineUrlScheme
        from qt.webengine import QWebEnginePage  # noqa
        from calibre.gui2 import must_use_qt
        from calibre.constants import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.Flag.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write oeb_book to output_path as a PDF.

        Stores the conversion state on self, builds book metadata from an
        OPF 2 package generated out of the OEB metadata and then dispatches
        to convert_images() or convert_text() depending on
        input_plugin.is_image_collection.
        '''
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from lxml import etree
        from io import BytesIO
        # Here OPF is the tag-name helper from oeb.base ...
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        # ... and from here on OPF is rebound to the OPF metadata reader.
        from calibre.ebooks.metadata.opf2 import OPF
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        ''' Render a collection of images into the output PDF. '''
        from calibre.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata, self.report_progress)

    def get_cover_data(self):
        '''
        Set self.cover_data to the raw bytes of the cover manifest item.

        Nothing is changed when the book declares no cover, the cover id is
        not in the manifest, or the item data is not a bytes object.
        '''
        oeb = self.oeb
        if not oeb.metadata.cover:
            return
        cover_id = unicode_type(oeb.metadata.cover[0])
        if cover_id not in oeb.manifest.ids:
            return
        item = oeb.manifest.ids[cover_id]
        if isinstance(item.data, bytes):
            self.cover_data = item.data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                try:
                    src = rule.style.getProperty('src').propertyValue[0].uri
                except Exception:
                    # @font-face rule without a usable src URI: best effort,
                    # just skip it.
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None or path in processed:
                    continue
                # Each font file is handled once, even when several
                # @font-face rules point at it.
                processed.add(path)

                raw = ff.data
                try:
                    nraw = remove_embed_restriction(raw)
                except Exception:
                    # Could not process the font data, leave it untouched.
                    continue
                if nraw != raw:
                    ff.data = nraw
                    self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        '''
        Render a text based book into the output PDF.

        The book is written to a temporary directory by the 'oeb' output
        plugin and the first *.opf file found there is passed to
        calibre.ebooks.pdf.html_writer.convert().
        '''
        import json
        from calibre.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            # Attach the stored per-file margins to the root element of each
            # document as a JSON encoded attribute.
            for href, margins in iteritems(self.stored_page_margins):
                item = oeb_book.manifest.hrefs.get(href)
                if item is not None:
                    root = item.data
                    if hasattr(root, 'xpath') and margins:
                        root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(
                opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
                log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
            )
Esempio n. 24
0
class KEPUBInput(EPUBInput):
    '''
    Input plugin for Kobo KEPUB files.

    The archive is unpacked into the current working directory, its OPF is
    cleaned up and re-written as content.opf, and (optionally) the
    per-sentence "koboSpan" elements are stripped from the spine documents.
    '''

    name = 'KePub Input'
    description = 'Convert KEPUB files (.kepub) to HTML'
    author = 'David Forrester'
    file_types = set(['kepub'])
    version = plugin_version
    minimum_calibre_version = (0, 1, 0)

    options = {
        OptionRecommendation(
            name='strip_kobo_spans',
            recommended_value=True,
            help=
            _('Kepubs have spans wrapping each sentence. These are used by the ereader for the reading location '
              'and bookmark location. They are not used by an ePub reader but are valid code and can be safely be '
              'left in the ePub. If you plan to edit the ePub, it is recommended that you remove the spans.'
              )),
    }

    recommendations = set([])

    def gui_configuration_widget(self,
                                 parent,
                                 get_option_by_name,
                                 get_option_help,
                                 db,
                                 book_id=None):
        ''' Return the configuration widget provided by the plugin. '''
        from calibre_plugins.kepubin.conversion.input_config import PluginWidget
        return PluginWidget(parent, get_option_by_name, get_option_help, db,
                            book_id)

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Unpack the KEPUB in stream and return the path of the rewritten OPF.

        Raises ValueError when no OPF can be found, when the book uses DTBook
        markup or when the spine ends up empty, and DRMError when a
        rights.xml file is present in the extracted book.
        '''
        log("KEPUBInput::convert - start")
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        # os.getcwdu() only exists on Python 2; fall back to os.getcwd()
        # (which already returns text) on Python 3.
        getcwd = getattr(os, 'getcwdu', os.getcwd)
        try:
            zf = ZipFile(stream)
            zf.extractall(getcwd())
        except Exception:
            log.exception('KEPUB appears to be invalid ZIP file, trying a '
                          'more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        opf = self.find_opf()
        if opf is None:
            # Fall back to the first *.opf found in the extracted tree,
            # ignoring hidden files and __MACOSX entries.
            for f in walk(u'.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError(
                _('%s is not a valid KEPUB file (could not find opf)') % path)

        # A rights.xml file in the extraction directory is treated as DRM.
        encfile = os.path.abspath('rights.xml')
        if os.path.exists(encfile):
            raise DRMError(os.path.basename(path))

        opf = os.path.relpath(opf, getcwd())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self.encrypted_fonts = []

        if len(parts) > 1 and parts[0]:
            # The OPF lives in a sub-directory but is re-written to the
            # current directory below, so prefix all hrefs with that
            # directory.
            delta = '/'.join(parts[:-1]) + '/'
            for elem in opf.itermanifest():
                elem.set('href', delta + elem.get('href'))
            for elem in opf.iterguide():
                elem.set('href', delta + elem.get('href'))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else \
            self.rationalize_cover2
        self.removed_cover = f(opf, log)

        self.optimize_opf_parsing = opf
        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    _('EPUB files with DTBook markup are not supported'))

        # Manifest ids whose media type must not appear in the spine.
        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_ and y.get('media-type', None) in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml', 'application/text'
            }:
                not_for_spine.add(id_)

        # Drop spine entries that are empty, excluded above or duplicated.
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if len(list(opf.iterspine())) == 0:
            raise ValueError(_('No valid entries in the spine of this EPUB'))

        with open('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath(u'content.opf')

    def postprocess_book(self, oeb, opts, log):
        '''
        Run the parent post-processing, then strip the Kobo sentence spans
        from every spine document unless opts.strip_kobo_spans is false.
        '''
        log("KEPUBInput::postprocess_book - start")
        from calibre.ebooks.oeb.base import XHTML_NS

        # The Kobo spans wrap each sentence. Remove them and add their text to
        # the parent tag.
        # NOTE(review): only the span's text and tail are kept; any child
        # elements inside a koboSpan are removed together with it.
        def refactor_span(a):
            p = a.getparent()
            idx = p.index(a) - 1
            p.remove(a)

            if idx < 0:
                # The span was the first child: its text belongs to the
                # parent's leading text.
                if p.text is None:
                    p.text = ''
                p.text += a.text if a.text else ''
                p.text += a.tail if a.tail else ''
            else:
                # Otherwise it is appended to the tail of the previous
                # sibling.
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.text if a.text else ''
                p[idx].tail += a.tail if a.tail else ''

        super(KEPUBInput, self).postprocess_book(oeb, opts, log)
        if not opts.strip_kobo_spans:
            log("KEPUBInput::postprocess_book - not stripping kobo spans")
            return

        for item in oeb.spine:
            log("item.__class__.__name__", item.__class__.__name__)
            if not hasattr(item.data, 'xpath'):
                continue

            for a in item.data.xpath('//h:span[@class="koboSpan"]',
                                     namespaces={'h': XHTML_NS}):
                refactor_span(a)

        log("KEPUBInput::postprocess_book - end")

    def process_encryption(self, encfile, opf, log):
        '''
        Return True when there is no rights.xml file in the current
        directory. The encfile argument is ignored.
        '''
        # Shouldn't get called, but overriding just in case.
        encfile = os.path.abspath('rights.xml')
        return not os.path.exists(encfile)
Esempio n. 25
0
class HTMLZOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the book as an HTMLZ archive: a ZIP file
    holding one HTML file, an optional style.css, the referenced images,
    an optional cover.jpg and a metadata.opf.
    '''

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    # Human readable descriptions of the option choices; the keys double as
    # the allowed values of the options below.
    ui_data = {
        'css_choices': {
            'class': _('Use CSS classes'),
            'inline': _('Use the style attribute'),
            'tag': _('Use HTML tags wherever possible')
        },
        'sheet_choices': {
            'external': _('Use an external CSS file'),
            'inline': _('Use a <style> tag in the HTML file')
        }
    }

    options = {
        OptionRecommendation(
            name='htmlz_css_type',
            recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: {class}\n'
                   'inline: {inline}\n'
                   'tag: {tag}').format(**ui_data['css_choices'])),
        OptionRecommendation(
            name='htmlz_class_style',
            recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: {external}\n'
                   'inline: {inline}').format(**ui_data['sheet_choices'])),
        OptionRecommendation(
            name='htmlz_title_filename',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'If set this option causes the file name of the HTML file'
                ' inside the HTMLZ archive to be based on the book title.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Convert oeb_book to HTML and pack the result into the ZIP archive at
        output_path.

        All files are first written to a temporary directory which is then
        added to the archive. A failure while exporting the cover is printed
        and otherwise ignored.
        '''
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from calibre.utils.zipfile import ZipFile
        from calibre.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from calibre.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(
                    unicode_type(oeb_book.metadata.title[0])), ))[0]
            with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
                if isinstance(html, unicode_type):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode, so text must be encoded
                # first (same treatment as the HTML above).
                if isinstance(css, unicode_type):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                images_dir = os.path.join(tdir, u'images')
                if not os.path.exists(images_dir):
                    os.makedirs(images_dir)
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            # SVG items are serialised to text here ...
                            data = unicode_type(
                                etree.tostring(item.data,
                                               encoding=unicode_type))
                        else:
                            data = item.data
                        # ... so encode before writing to the binary file;
                        # writing text to a 'wb' file fails.
                        if isinstance(data, unicode_type):
                            data = data.encode('utf-8')
                        img_path = os.path.join(images_dir, images[item.href])
                        with open(img_path, 'wb') as img:
                            img.write(data)

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from calibre.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    with lopen(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # The cover is optional: report the problem and carry on.
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(
                    io.BytesIO(
                        etree.tostring(oeb_book.metadata.to_opf1(),
                                       encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            # Close the archive explicitly so it is finalised before the
            # temporary directory goes away, instead of relying on garbage
            # collection.
            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                htmlz.close()
Esempio n. 26
0
class TXTOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the book as a text file, either as plain
    text or with Markdown or Textile formatting.
    '''

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    # Choice lists exposed to the configuration UI; the keys of
    # 'formatting_types' are the allowed txt_output_formatting values.
    ui_data = {
        'newline_types': NEWLINE_TYPES,
        'formatting_types': {
            'plain': _('Plain text'),
            'markdown': _('Markdown formatted text'),
            'textile': _('TexTile formatted text')
        },
    }

    options = {
        OptionRecommendation(
            name='newline',
            recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n',
            choices=NEWLINE_TYPES,
            help=
            _('Type of newline to use. Options are %s. Default is \'system\'. '
              'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
              'For macOS use \'unix\'. \'system\' will default to the newline '
              'type used by this OS.') % sorted(NEWLINE_TYPES)),
        OptionRecommendation(
            name='txt_output_encoding',
            recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is utf-8.')),
        OptionRecommendation(
            name='inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(
            name='max_line_length',
            recommended_value=0,
            level=OptionRecommendation.LOW,
            help=
            _('The maximum number of characters per line. This splits on '
              'the first space before the specified value. If no space is found '
              'the line will be broken at the space after and will exceed the '
              'specified value. Also, there is a minimum of 25 characters. '
              'Use 0 to disable line splitting.')),
        OptionRecommendation(
            name='force_max_line_length',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'Force splitting on the max-line-length value when no space '
                'is present. Also allows max-line-length to be below the minimum'
            )),
        OptionRecommendation(name='txt_output_formatting',
                             recommended_value='plain',
                             choices=list(ui_data['formatting_types']),
                             help=_('Formatting used within the document.\n'
                                    '* plain: {plain}\n'
                                    '* markdown: {markdown}\n'
                                    '* textile: {textile}').format(
                                        **ui_data['formatting_types'])),
        OptionRecommendation(
            name='keep_links',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'Do not remove links within the document. This is only '
                'useful when paired with a TXT output formatting option that '
                'is not none because links are always removed with plain text output.'
            )),
        OptionRecommendation(
            name='keep_image_references',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=
            _('Do not remove image references within the document. This is only '
              'useful when paired with a TXT output formatting option that '
              'is not none because links are always removed with plain text output.'
              )),
        OptionRecommendation(
            name='keep_color',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=
            _('Do not remove font color from output. This is only useful when '
              'TXT output formatting is set to textile. Textile is the only '
              'formatting that supports setting font color. If this option is '
              'not specified font color will not be set and default to the '
              'color displayed by the reader (generally this is black).')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Extract the text of oeb_book and write it to output_path.

        output_path may be a filesystem path or an object with a write()
        method. When it is a path, missing parent directories are created
        and the file is closed again even if writing fails; a stream passed
        in by the caller is left open.
        '''
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        # Pick the writer matching the requested formatting; anything other
        # than markdown or textile falls back to the plain text writer.
        formatting = opts.txt_output_formatting.lower()
        if formatting == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif formatting == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # Only close the stream if it was opened here.
        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
        finally:
            if close:
                out_stream.close()
Esempio n. 27
0
class HTMLOutput(OutputFormatPlugin):
    '''
    Output plugin that renders a book as templated HTML pages inside a ZIP.

    ``convert()`` builds an index page and one page per spine item in a
    temporary directory, zips the result to the requested output path and,
    when ``opts.extract_to`` is set, also extracts the archive there.
    '''

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'
    commit_name = 'html_output'

    options = {
        OptionRecommendation(
            name='template_css',
            help=_(
                'CSS file used for the output instead of the default file')),
        OptionRecommendation(
            name='template_html_index',
            help=
            _('Template used for generation of the HTML index file instead of the default file'
              )),
        OptionRecommendation(
            name='template_html',
            help=
            _('Template used for the generation of the HTML contents of the book instead of the default file'
              )),
        OptionRecommendation(
            name='extract_to',
            help=_(
                'Extract the contents of the generated ZIP file to the '
                'specified directory. WARNING: The contents of the directory '
                'will be deleted.')),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate table of contents

        Walks ``oeb_book.toc`` recursively and builds nested ``ul``/``li``/``a``
        elements for it.

        :param oeb_book: Book whose ``toc`` tree is rendered.
        :param ref_url: Path of the page that will embed the TOC; link targets
            are made relative to its directory.
        :param output_dir: Directory made current while links are resolved, so
            that ``abspath()`` of each TOC href is taken relative to it.
        :return: A ``div`` element wrapping the top-level ``ul``.
        '''
        from lxml import etree
        try:
            from urllib import unquote
        except ImportError:  # Python 3 moved unquote to urllib.parse
            from urllib.parse import unquote

        from calibre.ebooks.oeb.base import element
        from calibre.utils.cleantext import clean_xml_chars
        with CurrentDir(output_dir):

            def build_node(current_node, parent=None):
                # The root call creates the outermost list; nested calls only
                # open a sub-list when the node actually has children.
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, 'ul')
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(abspath(unquote(node.href)),
                                   dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        # Collapse runs of whitespace into single spaces.
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent

            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        '''
        Return the table of contents from ``generate_toc()`` serialised with
        ``etree.tostring`` (pretty-printed, UTF-8, no XML declaration).
        '''
        from lxml import etree

        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root,
                              pretty_print=True,
                              encoding='utf-8',
                              xml_declaration=False)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` to HTML files and pack them into ``output_path``.

        :param oeb_book: Book to render; its ``metadata``, ``toc``,
            ``manifest`` and ``spine`` are read.
        :param output_path: Path of the ZIP file to create. A trailing
            ``.zip`` is dropped to derive the names used inside the archive
            (``<name>.html`` and ``<name>_files``).
        :param input_plugin: Not used by this method.
        :param opts: Conversion options; ``template_html_index``,
            ``template_html``, ``template_css`` and ``extract_to`` are read.
        :param log: Logger, stored on ``self.log``.
        '''
        from lxml import etree
        from calibre.utils import zipfile
        from templite import Templite
        try:
            from urllib import unquote
        except ImportError:  # Python 3 moved unquote to urllib.parse
            from urllib.parse import unquote
        from calibre.ebooks.html.meta import EasyMeta

        def read_template(custom_path, default_resource):
            # Prefer the user-supplied file, else the bundled resource. The
            # file is closed as soon as it has been read.
            if custom_path is not None:
                with open(custom_path, 'rb') as src:
                    raw = src.read()
            else:
                raw = P(default_resource, data=True)
            return raw.decode('utf-8')

        template_html_index_data = read_template(
            opts.template_html_index,
            'templates/html_export_default_index.tmpl')
        template_html_data = read_template(
            opts.template_html, 'templates/html_export_default.tmpl')
        template_css_data = read_template(
            opts.template_css, 'templates/html_export_default.css')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        try:
            # Only a trailing ".zip" is stripped, and both names are built
            # directly inside tempdir, so a ".zip"/".html" occurring elsewhere
            # in a path can no longer be removed by accident.
            stem = basename(re.sub(r'\.zip$', '', output_path))
            output_file = os.path.join(tempdir, stem + '.html')
            output_dir = os.path.join(tempdir, stem + '_files')

            if not exists(output_dir):
                os.makedirs(output_dir)

            css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
            with open(css_path, 'wb') as f:
                f.write(template_css_data.encode('utf-8'))

            # index page
            with open(output_file, 'wb') as f:
                html_toc = self.generate_html_toc(oeb_book, output_file,
                                                  output_dir)
                templite = Templite(template_html_index_data)
                nextLink = oeb_book.spine[0].href
                nextLink = relpath(output_dir + os.sep + nextLink,
                                   dirname(output_file))
                cssLink = relpath(abspath(css_path), dirname(output_file))
                tocUrl = relpath(output_file, dirname(output_file))
                t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                    toc=html_toc,
                                    meta=meta,
                                    nextLink=nextLink,
                                    tocUrl=tocUrl,
                                    cssLink=cssLink,
                                    firstContentPageLink=nextLink)
                if isinstance(t, unicode_type):
                    t = t.encode('utf-8')
                f.write(t)

            with CurrentDir(output_dir):
                for item in oeb_book.manifest:
                    path = abspath(unquote(item.href))
                    item_dir = dirname(path)
                    if not exists(item_dir):
                        os.makedirs(item_dir)
                    if item.spine_position is not None:
                        # Create/truncate an empty file; the spine loop below
                        # writes the rendered page to this same path.
                        with open(path, 'wb') as f:
                            pass
                    else:
                        with open(path, 'wb') as f:
                            f.write(str(item))
                        item.unload_data_from_memory(memory=path)

                xhtml_ns = {'h': 'http://www.w3.org/1999/xhtml'}
                style_pat = re.compile(r'\<style.*\/style\>', re.M | re.S)
                for item in oeb_book.spine:
                    path = abspath(unquote(item.href))
                    item_dir = dirname(path)
                    root = item.data.getroottree()

                    # get & clean HTML <HEAD>-data
                    head = root.xpath('//h:head', namespaces=xhtml_ns)[0]
                    head_content = etree.tostring(head,
                                                  pretty_print=True,
                                                  encoding='utf-8')
                    head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                    head_content = style_pat.sub('', head_content)
                    # expand a self-closed <title/> into <title></title>
                    head_content = re.sub(r'<(title)([^>]*)/>',
                                          r'<\1\2></\1>', head_content)

                    # get & clean HTML <BODY>-data
                    body = root.xpath('//h:body', namespaces=xhtml_ns)[0]
                    ebook_content = etree.tostring(body,
                                                   pretty_print=True,
                                                   encoding='utf-8')
                    ebook_content = re.sub(r'\<\/?body.*\>', '',
                                           ebook_content)
                    # expand self-closed div/a/span into open+close pairs
                    ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                           r'<\1\2></\1>', ebook_content)

                    # generate link to next page
                    if item.spine_position + 1 < len(oeb_book.spine):
                        nextLink = oeb_book.spine[item.spine_position +
                                                  1].href
                        nextLink = relpath(abspath(nextLink), item_dir)
                    else:
                        nextLink = None

                    # generate link to previous page
                    if item.spine_position > 0:
                        prevLink = oeb_book.spine[item.spine_position -
                                                  1].href
                        prevLink = relpath(abspath(prevLink), item_dir)
                    else:
                        prevLink = None

                    cssLink = relpath(abspath(css_path), item_dir)
                    tocUrl = relpath(output_file, item_dir)
                    firstContentPageLink = oeb_book.spine[0].href

                    # render template; the TOC is passed as a callable so it
                    # is only generated if the template asks for it
                    templite = Templite(template_html_data)

                    def toc():
                        return self.generate_html_toc(oeb_book, path,
                                                      output_dir)

                    t = templite.render(
                        ebookContent=ebook_content,
                        prevLink=prevLink,
                        nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()),
                        toc=toc,
                        tocUrl=tocUrl,
                        head_content=head_content,
                        meta=meta,
                        cssLink=cssLink,
                        firstContentPageLink=firstContentPageLink)

                    # write html to file, encoding text the same way the
                    # index page does
                    if isinstance(t, unicode_type):
                        t = t.encode('utf-8')
                    with open(path, 'wb') as f:
                        f.write(t)
                    item.unload_data_from_memory(memory=path)

            zfile = zipfile.ZipFile(output_path, "w")
            try:
                zfile.add_dir(output_dir, basename(output_dir))
                zfile.write(output_file, basename(output_file),
                            zipfile.ZIP_DEFLATED)

                if opts.extract_to:
                    # Destructive by design: see the extract_to option help.
                    if os.path.exists(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    os.makedirs(opts.extract_to)
                    zfile.extractall(opts.extract_to)
                    self.log('Zip file extracted to', opts.extract_to)
            finally:
                zfile.close()
        finally:
            # cleanup temp dir, also when rendering or zipping failed
            shutil.rmtree(tempdir)
Esempio n. 28
0
class KEPubOutput(OutputFormatPlugin):
    '''
    Output plugin that produces a KePub file.

    The actual conversion is delegated to a wrapped ``EPUBOutput`` instance;
    the resulting file is then post-processed with ``modify_epub()``.
    '''
    name = 'KePub Output'
    author = 'Joel Goguen'
    file_type = 'kepub'
    version = plugin_version
    minimum_calibre_version = plugin_minimum_calibre_version

    epub_output_plugin = None
    configdir = os.path.join(config_dir, 'plugins')
    reference_kepub = os.path.join(configdir, 'reference.kepub.epub')
    options = set([
        OptionRecommendation(
            name='kepub_hyphenate',
            recommended_value=True,
            help=
            'Select this to add a CSS file which enables hyphenation. The language used will be the language defined for the book in calibre. Please see the README file for directions on updating hyphenation dictionaries.'
        ),
        OptionRecommendation(
            name='kepub_replace_lang',
            recommended_value=True,
            help=
            'Select this to replace the defined language in each content file inside the ePub.'
        ),
        OptionRecommendation(
            name='kepub_clean_markup',
            recommended_value=True,
            help='Select this to clean up the internal ePub markup.')
    ])
    recommendations = set([])

    def __init__(self, *args, **kwargs):
        '''
        Create the wrapped EPUB output plugin and merge its options and
        recommendations into this instance's own.
        '''
        self.epub_output_plugin = EPUBOutput(*args, **kwargs)
        # union() returns new sets, so the class-level sets stay untouched.
        self.options = self.options.union(self.epub_output_plugin.options)
        self.recommendations = self.recommendations.union(
            self.epub_output_plugin.recommendations)
        OutputFormatPlugin.__init__(self, *args, **kwargs)

    def gui_configuration_widget(self,
                                 parent,
                                 get_option_by_name,
                                 get_option_help,
                                 db,
                                 book_id=None):
        '''
        Return the plugin's ``PluginWidget``, built from the given arguments.
        '''
        from calibre_plugins.koboconversion.conversion.config import PluginWidget
        return PluginWidget(parent, get_option_by_name, get_option_help, db,
                            book_id)

    def convert(self, oeb_book, output, input_plugin, opts, log):
        '''
        Convert ``oeb_book`` to EPUB at ``output`` and turn it into a KePub.

        :param oeb_book: Book passed through to the wrapped EPUB plugin.
        :param output: Output path; written by the EPUB plugin, then reopened
            as a ``KEPubContainer`` and handed to ``modify_epub()``.
        :param input_plugin: Passed through to the wrapped EPUB plugin.
        :param opts: Conversion options; ``kepub_clean_markup``,
            ``kepub_hyphenate`` and ``kepub_replace_lang`` are read here.
        :param log: Logger passed to the wrapped EPUB plugin.

        Returns early, leaving the plain EPUB output in place, when the
        container reports ``is_drm_encumbered``.
        '''
        self.epub_output_plugin.convert(oeb_book, output, input_plugin, opts,
                                        log)
        # NOTE(review): the container gets ``default_log`` rather than the
        # ``log`` argument - confirm this is intended.
        container = KEPubContainer(output, default_log)

        if container.is_drm_encumbered:
            return

        # Write the details file
        o = {
            'kepub_output_version': ".".join(str(n) for n in self.version),
            'kepub_output_currenttime': datetime.utcnow().ctime()
        }
        kte_data_file = self.temporary_file('_KePubOutputPluginInfo')
        try:
            kte_data_file.write(json.dumps(o))
        finally:
            # Close (and thereby flush) before the file is copied by name.
            kte_data_file.close()
        container.copy_file_to_container(kte_data_file.name,
                                         name='plugininfo.kte',
                                         mt='application/json')

        # Rebuild minimal metadata from the generated OPF, falling back to
        # NULL_VALUES for anything the OPF does not provide.
        title = container.opf_xpath("./opf:metadata/dc:title/text()")
        if len(title) > 0:
            title = title[0]
        else:
            title = NULL_VALUES['title']
        authors = container.opf_xpath(
            './opf:metadata/dc:creator[@opf:role="aut"]/text()')
        if len(authors) < 1:
            authors = NULL_VALUES['authors']
        mi = Metadata(title, authors)
        language = container.opf_xpath("./opf:metadata/dc:language/text()")
        if len(language) > 0:
            mi.languages = language
            language = language[0]
        else:
            mi.languages = NULL_VALUES['languages']
            language = NULL_VALUES['language']
        # Previously a bare ``mi.language`` expression that discarded the
        # value computed above; store the primary language on the metadata.
        mi.language = language

        modify_epub(container,
                    output,
                    metadata=mi,
                    opts={
                        'clean_markup': opts.kepub_clean_markup,
                        'hyphenate': opts.kepub_hyphenate,
                        'replace_lang': opts.kepub_replace_lang,
                        'smarten_punctuation': False,
                        'extended_kepub_features': True
                    })
Esempio n. 29
0
class HTMLInput(InputFormatPlugin):
    '''
    Input plugin that turns HTML files, or an OPF file, into an OEB book.

    For HTML input the linked files are collected with ``get_filelist()``,
    added to the manifest and spine, and every link to a readable local
    resource is rewritten to point at a manifest item.
    '''

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(
            name='breadth_first',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.')),
        OptionRecommendation(
            name='max_levels',
            recommended_value=5,
            level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.')),
        OptionRecommendation(
            name='dont_package',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_(
                'Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.')),
    }

    def convert(self, stream, opts, file_ext, log, accelerators):
        '''
        Build an OEB book from ``stream``.

        :param stream: Input file object; its ``name`` is used as the path of
            the file to convert.
        :param opts: Conversion options; ``dont_package`` and
            ``input_encoding`` are read here.
        :param file_ext: Input extension. ``'opf'`` is handed straight to the
            plumber's ``create_oebbook``; everything else goes through this
            plugin's own ``create_oebbook()``.
        :param log: Logger for the conversion.
        :param accelerators: Not used by this method.
        :raises ValueError: If ``opts.dont_package`` is set for non-OPF input.
        '''
        self._is_case_sensitive = None
        basedir = getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError(
                    'The --dont-package option is not supported for an HTML input file'
                )
            from calibre.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                # Start from metadata guessed from the file name and update
                # it with what was read from the HTML itself.
                from calibre.ebooks.metadata.meta import metadata_from_filename
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log,
                              stream.name,
                              opts,
                              encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        '''
        Report whether the filesystem holding ``path`` is case sensitive.

        The first answer derived from an existing path is cached on the
        instance and returned for every later call, whatever path is passed.
        When ``path`` is empty or does not exist, the platform default
        (``islinux or isbsd``) is returned and nothing is cached.
        '''
        if getattr(self, '_is_case_sensitive', None) is not None:
            return self._is_case_sensitive
        if not path or not os.path.exists(path):
            return islinux or isbsd
        # If both the lower- and upper-cased spellings exist, the filesystem
        # is treated as case insensitive.
        self._is_case_sensitive = not (os.path.exists(path.lower())
                                       and os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        '''
        Create and populate an OEB book from an HTML file and its links.

        :param htmlpath: Path of the root HTML file.
        :param basedir: Base directory passed to ``get_filelist()``.
        :param opts: Conversion options; ``input_encoding`` is read, and
            ``language``/``authors`` are used as fallbacks when present.
        :param log: Logger for the conversion.
        :param mi: Metadata copied into the book's OEB metadata.
        :return: The populated OEB book, also stored on ``self.oeb``.
        '''
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer, rewrite_links,
                                             urlnormalize, urldefrag,
                                             BINARY_MIME, OEB_STYLES, xpath,
                                             urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser
        import logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log,
                             None,
                             opts,
                             self,
                             encoding=opts.input_encoding,
                             populate=False)
        self.oeb = oeb

        # Fill in language, creator and title when the metadata lacks them.
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): this stores identifier[0], not the matching
                # ``ident`` - confirm that is intended.
                self.oeb.uid = metadata.identifier[0]
                break

        # Add every non-binary file found by get_filelist() to the manifest
        # and the spine.
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path),
                                         log,
                                         ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(
                id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Kept on the instance because resource_adder() needs them after the
        # local imports above have gone out of scope.
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # Rewrite url() references inside stylesheets, resolving them against
        # the directory the stylesheet was originally loaded from.
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(
                    item.data, partial(self.resource_adder, base=dpath))

        # Build a TOC with one entry per spine item, labelled by page title,
        # or by the first heading when the titles are not unique.
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        # NOTE(review): ``titles`` skips pages with an empty title and both
        # lists skip non-linear items, while zip() pairs them with the whole
        # spine - labels can shift in those cases; confirm this is acceptable.
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        '''
        Resolve a link to a local filesystem path.

        :param link_: The link as text, or as UTF-8 encoded bytes.
        :param base: Base used to resolve the link; defaults to the current
            working directory.
        :return: ``(path, fragment)``, or ``(None, None)`` when the link
            cannot be decoded or parsed, is not a local resource, or resolves
            to an empty path.
        '''
        from calibre.ebooks.html.input import Link
        if not isinstance(link_, unicode_type):
            try:
                # 'strict' is the real name of the raise-on-error handler;
                # the former 'error' is not a registered handler.
                link_ = link_.decode('utf-8', 'strict')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring' % link_)
                return None, None
        try:
            parsed = Link(link_, base if base else getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r' % link_)
            return None, None
        if parsed.path is None:
            # Not a local resource
            return None, None
        link = parsed.path.replace('/', os.sep).strip()
        frag = parsed.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        '''
        Link-rewriting callback: add the linked local file to the manifest.

        :param link_: The link as it appears in the document.
        :param base: Directory against which relative links are resolved.
        :return: The manifest href of the resource (with the original
            fragment re-attached); ``link_`` unchanged when it is not a
            readable local file or is a directory; ``None`` for links to
            plain text files.
        '''
        from polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(
                id='added', href=sanitize_file_name(bhref))
            guessed = self.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r' % link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type(
                            'dummy.' + img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                                                   self.oeb.log,
                                                   ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, unicode_type):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = partial(self.css_import_handler,
                                                  os.path.dirname(link))
            # Presumably touching .data forces the load while oeb.container
            # still points at this file's directory - TODO confirm.
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        '''
        Fetch callback for stylesheet imports that reads local files.

        :param base: Directory against which ``href`` is resolved.
        :param href: The imported stylesheet reference.
        :return: ``(None, css_text)`` on success, ``(None, None)`` when the
            target is not a readable local file or reading it fails.
        '''
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
            return None, None
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except Exception:
            self.log.exception('Failed to read CSS file: %r' % link)
            return None, None
        return None, raw
Esempio n. 30
0
class RecipeInput(InputFormatPlugin):
    """Input plugin that produces a book by running a recipe.

    ``convert`` locates and compiles a recipe (from a previously downloaded
    archive, a recipe URN in the environment, a recipe file on disk, or a
    builtin recipe looked up by title), runs it, and returns the path of the
    OPF file found in the current working directory.
    """

    name = 'Recipe Input'
    author = 'Kovid Goyal'
    description = _('Download periodical content from the internet')
    file_types = {'recipe', 'downloaded_recipe'}
    commit_name = 'recipe_input'

    # (option name, value, level) tuples declared at HIGH level.
    recommendations = {
        ('chapter', None, OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('input_encoding', None, OptionRecommendation.HIGH),
        ('input_profile', 'default', OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('insert_metadata', False, OptionRecommendation.HIGH),
    }

    options = {
        OptionRecommendation(
            name='test',
            recommended_value=False,
            help=
            _('Useful for recipe development. Forces'
              ' max_articles_per_feed to 2 and downloads at most 2 feeds.'
              ' You can change the number of feeds and articles by supplying optional arguments.'
              ' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.'
              )),
        OptionRecommendation(
            name='username',
            recommended_value=None,
            help=_('Username for sites that require a login to access '
                   'content.')),
        OptionRecommendation(
            name='password',
            recommended_value=None,
            help=_('Password for sites that require a login to access '
                   'content.')),
        OptionRecommendation(
            name='dont_download_recipe',
            recommended_value=False,
            help=
            _('Do not download latest version of builtin recipes from the calibre server'
              )),
        OptionRecommendation(
            name='lrf',
            recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
    }

    def convert(self, recipe_or_file, opts, file_ext, log, accelerators):
        """Compile and run a recipe, returning the path to the resulting OPF.

        The recipe is chosen as follows:

        * ``file_ext == 'downloaded_recipe'``: ``recipe_or_file`` is a zip
          archive that is extracted into the current directory; the recipe
          is compiled from the extracted ``download.recipe`` and
          instantiated without calling ``download()``.
        * otherwise, if the ``CALIBRE_RECIPE_URN`` environment variable is
          set, the recipe source is fetched by that URN;
        * otherwise, if ``recipe_or_file`` is a readable file, it is compiled
          as a custom recipe;
        * otherwise a builtin recipe is looked up by title, preferring the
          downloaded version and falling back to the bundled one.

        Sets ``self.recipe_source`` and ``self.recipe_object`` and copies
        the recipe object's ``conversion_options`` onto ``opts``.

        :return: Absolute path of the first ``.opf`` file found in the
            current directory (top level first, then via ``walk``), or
            ``None`` if there is none.
        :raises ValueError: for an invalid or unknown recipe URN, a builtin
            recipe that cannot be found, or when no recipe was obtained.
        :raises RecipeDisabled: if the recipe has a non-None
            ``recipe_disabled`` attribute.
        """
        from calibre.web.feeds.recipes import compile_recipe
        # NOTE(review): presumably turns off flow splitting for recipe
        # output -- confirm against the output profile documentation.
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            from calibre.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            try:
                zf.extractall()
            finally:
                # Close the archive even if extraction fails.
                zf.close()
            with lopen('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                # Split "<type>:<id>" on the first colon only.
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(
                        urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with lopen(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from calibre.web.feeds.recipes.collection import (
                    get_builtin_recipe_by_title, get_builtin_recipe_titles)
                # First try the file's base name without its extension as
                # the title; if that is not a builtin title, retry with the
                # full argument (minus extension).
                title = getattr(opts, 'original_recipe_input_arg',
                                recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    title = getattr(opts, 'original_recipe_input_arg',
                                    recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(
                    title,
                    log=log,
                    download_recipe=not opts.dont_download_recipe)
                # ``builtin`` becomes True when the bundled copy must be
                # used instead of the one fetched above.
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        # The version components may not be strings, so
                        # convert each one before joining.
                        log.warn(
                            'Downloaded recipe needs calibre version at least: %s'
                            % '.'.join(map(str, recipe.requires_version)))
                        builtin = True
                except Exception:
                    log.exception(
                        'Failed to compile downloaded recipe. Falling '
                        'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title,
                                                      log=log,
                                                      download_recipe=False)
                    if raw is None:
                        raise ValueError('Failed to find builtin recipe: ' +
                                         title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError(
                    '%r is not a valid recipe file or builtin recipe' %
                    recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        # Prefer an OPF at the top level of the working directory...
        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        # ...and otherwise take the first one found by walk().
        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)
        # Falls through and returns None when no OPF file exists.

    def postprocess_book(self, oeb, opts, log):
        """Run the recipe object's post-processing hooks on ``oeb``.

        Does nothing when no recipe object is available, including when
        ``convert`` has not set one on this instance.
        """
        recipe_object = getattr(self, 'recipe_object', None)
        if recipe_object is not None:
            recipe_object.internal_postprocess_book(oeb, opts, log)
            recipe_object.postprocess_book(oeb, opts, log)

    def specialize(self, oeb, opts, log, output_fmt):
        """Remove inline navigation bars when ``opts.no_inline_navbars`` is set.

        Every ``div`` whose class attribute contains ``calibre_navbar`` is
        removed from each spine item.
        """
        if opts.no_inline_navbars:
            from calibre.ebooks.oeb.base import XPath
            for item in oeb.spine:
                for div in XPath(
                        '//h:div[contains(@class, "calibre_navbar")]')(
                            item.data):
                    div.getparent().remove(div)

    def save_download(self, zf):
        """Store the recipe source in ``zf`` as ``download.recipe``.

        Text source is encoded as UTF-8 before being written.
        """
        raw = self.recipe_source
        if isinstance(raw, unicode_type):
            raw = raw.encode('utf-8')
        zf.writestr('download.recipe', raw)