class DOCXInput(InputFormatPlugin):
    """Input plugin: convert DOCX/DOCM word-processor files to HTML."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = 'Convert DOCX files (.docx and .docm) to HTML'
    file_types = {'docx', 'docm'}
    commit_name = 'docx_input'

    options = {
        OptionRecommendation(
            name='docx_no_cover', recommended_value=False,
            help='Normally, if a large image is present at the start of the '
                 'document that looks like a cover, it will be removed from '
                 'the document and used as the cover for created e-book. This '
                 'option turns off that behavior.'),
        OptionRecommendation(
            name='docx_no_pagebreaks_between_notes', recommended_value=False,
            help='Do not insert a page break after every endnote.'),
        OptionRecommendation(
            name='docx_inline_subsup', recommended_value=False,
            help='Render superscripts and subscripts so that they do not '
                 'affect the line height.'),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        """Delegate the actual conversion to the DOCX-to-HTML converter.

        Returns whatever the converter callable produces (the OPF path of
        the generated book, per the InputFormatPlugin contract).
        """
        from ebook_converter.ebooks.docx.to_html import Convert
        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
            nosupsub=options.docx_inline_subsup,
        )
        return converter()
class PDBOutput(OutputFormatPlugin):
    """Output plugin: write the book into a PDB (Palm database) container."""

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(
            name='format', recommended_value='doc',
            level=OptionRecommendation.LOW, short_switch='f',
            choices=list(ALL_FORMAT_WRITERS),
            help='Format to use inside the pdb container. Choices are: %s' %
                 sorted(ALL_FORMAT_WRITERS)),
        OptionRecommendation(
            name='pdb_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
                 'The default is cp1252. Note: This option is not honored by '
                 'all formats.'),
        OptionRecommendation(
            name='inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.'),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Serialize *oeb_book* into *output_path* (a path or open stream).

        Raises PDBError when no writer is registered for the requested
        inner format.
        """
        # Resolve the writer before touching the output file so that an
        # unknown format does not leave behind an opened/truncated file.
        Writer = get_writer(opts.format)
        if Writer is None:
            # BUG FIX: the message previously interpolated the *builtin*
            # `format` function instead of the requested format name.
            raise PDBError('No writer available for format %s.' % opts.format)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # The PDB writers do their own line wrapping; disable the generic one.
        setattr(opts, 'max_line_length', 0)
        setattr(opts, 'force_max_line_length', False)

        writer = Writer(opts, log)

        try:
            out_stream.seek(0)
            out_stream.truncate()
            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams we opened ourselves; caller-owned streams
            # stay open.
            if close:
                out_stream.close()
class RBOutput(OutputFormatPlugin):
    """Output plugin: produce Rocket eBook (.rb) files."""

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = {
        OptionRecommendation(name='inline_toc',
                             recommended_value=False,
                             level=OptionRecommendation.LOW,
                             help='Add Table of Contents to beginning of the '
                                  'book.')}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write *oeb_book* as an RB file to *output_path* (path or stream)."""
        from ebook_converter.ebooks.rb.writer import RBWriter

        # A bare path means we own the stream and must close it afterwards.
        owns_stream = not hasattr(output_path, 'write')
        if owns_stream:
            parent = os.path.dirname(output_path)
            if parent and not os.path.exists(parent):
                os.makedirs(parent)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        writer = RBWriter(opts, log)
        out_stream.seek(0)
        out_stream.truncate()
        writer.write_content(oeb_book, out_stream, oeb_book.metadata)

        if owns_stream:
            out_stream.close()
class TCROutput(OutputFormatPlugin):
    """Output plugin: produce TCR (compressed text) files."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(
            name='tcr_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
                 'The default is utf-8.')
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Extract the book text, TCR-compress it and write it out."""
        from ebook_converter.ebooks.txt.txtml import TXTMLizer
        from ebook_converter.ebooks.compression.tcr import compress

        # A bare path means we own the stream and must close it afterwards.
        owns_stream = not hasattr(output_path, 'write')
        if owns_stream:
            parent = os.path.dirname(output_path)
            if parent and not os.path.exists(parent):
                os.makedirs(parent)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # TCR wants raw, unwrapped text; turn off the TXT formatting knobs.
        setattr(opts, 'flush_paras', False)
        setattr(opts, 'max_line_length', 0)
        setattr(opts, 'force_max_line_length', False)
        setattr(opts, 'indent_paras', False)

        txt_writer = TXTMLizer(log)
        text = txt_writer.extract_content(oeb_book, opts)
        payload = text.encode(opts.tcr_output_encoding, 'replace')

        log.info('Compressing text...')
        payload = compress(payload)

        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(payload)

        if owns_stream:
            out_stream.close()
class ComicInput(InputFormatPlugin):
    """Input plugin: optimize comic archives (.cbz, .cbr, .cbc) for e-readers.

    Extracts the page images, optionally processes them (resize, sharpen,
    despeckle, ...) and builds an OPF + XHTML wrapper per page.
    """

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = ('Optimize comic files (.cbz, .cbr, .cbc) for viewing on '
                   'portable devices')
    file_types = {'cbz', 'cbr', 'cbc'}
    is_image_collection = True
    commit_name = 'comic_input'
    core_usage = -1

    options = {
        OptionRecommendation(
            name='colors', recommended_value=0,
            help='Reduce the number of colors used in the image. This works '
                 'only if you choose the PNG output format. It is useful to '
                 'reduce file sizes. Set to zero to turn off. Maximum value '
                 'is 256. It is off by default.'),
        OptionRecommendation(
            name='dont_normalize', recommended_value=False,
            help='Disable normalize (improve contrast) color range '
                 'for pictures. Default: False'),
        OptionRecommendation(
            name='keep_aspect_ratio', recommended_value=False,
            help='Maintain picture aspect ratio. Default is to fill the '
                 'screen.'),
        OptionRecommendation(name='dont_sharpen', recommended_value=False,
                             help='Disable sharpening.'),
        OptionRecommendation(
            name='disable_trim', recommended_value=False,
            help='Disable trimming of comic pages. For some comics, trimming '
                 'might remove content as well as borders.'),
        OptionRecommendation(
            name='landscape', recommended_value=False,
            help="Don't split landscape images into two portrait images"),
        OptionRecommendation(
            name='wide', recommended_value=False,
            help="Keep aspect ratio and scale image using screen height as "
                 "image width for viewing in landscape mode."),
        OptionRecommendation(
            name='right2left', recommended_value=False,
            help='Used for right-to-left publications like manga. '
                 'Causes landscape pages to be split into portrait pages '
                 'from right to left.'),
        OptionRecommendation(
            name='despeckle', recommended_value=False,
            help='Enable Despeckle. Reduces speckle noise. May greatly '
                 'increase processing time.'),
        OptionRecommendation(
            name='no_sort', recommended_value=False,
            help="Don't sort the files found in the comic "
                 "alphabetically by name. Instead use the order they were "
                 "added to the comic."),
        OptionRecommendation(
            name='output_format', choices=['png', 'jpg'],
            recommended_value='png',
            help='The format that images in the created e-book are '
                 'converted to. You can experiment to see which format '
                 'gives you optimal size and look on your device.'),
        OptionRecommendation(name='no_process', recommended_value=False,
                             help="Apply no processing to the image"),
        OptionRecommendation(
            name='dont_grayscale', recommended_value=False,
            help='Do not convert the image to grayscale (black and white)'),
        OptionRecommendation(
            name='comic_image_size', recommended_value=None,
            help='Specify the image size as widthxheight pixels. Normally,'
                 ' an image size is automatically calculated from the output '
                 'profile, this option overrides it.'),
        OptionRecommendation(
            name='dont_add_comic_pages_to_toc', recommended_value=False,
            help='When converting a CBC do not add links to each page to'
                 ' the TOC. Note this only applies if the TOC has more than '
                 'one section'),
    }

    # Comics are pure images: disable all text-oriented pipeline features.
    # BUG FIX: a misspelled duplicate entry ('page_breaks_brefore') has been
    # removed; the correctly spelled 'page_breaks_before' entry remains.
    recommendations = {
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('change_justification', 'left', OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
    }

    def get_comics_from_collection(self, stream):
        """Extract a CBC collection and return a list of [title, path] pairs.

        The collection must contain a comics.txt manifest, one
        ``filename:title`` entry per line.  Raises ValueError when the
        manifest is missing or lists no readable comics.
        """
        from ebook_converter.libunzip import extract as zipextract
        tdir = PersistentTemporaryDirectory('_comic_collection')
        zipextract(stream, tdir)
        comics = []
        with CurrentDir(tdir):
            if not os.path.exists('comics.txt'):
                raise ValueError(
                    ('%s is not a valid comic collection'
                     ' no comics.txt was found in the file') % stream.name)
            with open('comics.txt', 'rb') as f:
                raw = f.read()
            # comics.txt may carry a BOM; decode accordingly and strip the
            # decoded BOM character.
            if raw.startswith(codecs.BOM_UTF16_BE):
                raw = raw.decode('utf-16-be')[1:]
            elif raw.startswith(codecs.BOM_UTF16_LE):
                raw = raw.decode('utf-16-le')[1:]
            elif raw.startswith(codecs.BOM_UTF8):
                raw = raw.decode('utf-8')[1:]
            else:
                raw = raw.decode('utf-8')
            for line in raw.splitlines():
                line = line.strip()
                if not line:
                    continue
                fname, _, title = line.partition(':')
                fname = fname.replace('#', '_')
                fname = os.path.join(tdir, *fname.split('/'))
                if not title:
                    # No explicit title: use the file name sans extension.
                    title = os.path.basename(fname).rpartition('.')[0]
                if os.access(fname, os.R_OK):
                    comics.append([title, fname])
        if not comics:
            raise ValueError('%s has no comics' % stream.name)
        return comics

    def get_pages(self, comic, tdir2):
        """Extract page images from *comic* into *tdir2* and return them.

        Raises ValueError when the archive contains no (valid) pages.
        """
        from ebook_converter.ebooks.comic.input import (extract_comic,
                                                        process_pages,
                                                        find_pages)
        tdir = extract_comic(comic)
        new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
                               verbose=self.opts.verbose)
        if not new_pages:
            raise ValueError('Could not find any pages in the comic: %s'
                             % comic)
        if self.opts.no_process:
            # Copy pages verbatim, prefixing the index to keep the order.
            copied = []
            for i, page in enumerate(new_pages):
                copied.append(os.path.join(
                    tdir2, '{} - {}'.format(i, os.path.basename(page))))
                shutil.copyfile(page, copied[-1])
            new_pages = copied
        else:
            new_pages, failures = process_pages(
                new_pages, self.opts, self.report_progress, tdir2)
            if failures:
                self.log.warning('Could not process the following pages '
                                 '(run with --verbose to see why):')
                for f in failures:
                    self.log.warning('\t', f)
            if not new_pages:
                raise ValueError(
                    'Could not find any valid pages in comic: %s' % comic)
            # NOTE: the previous code also computed a 'thumbnail.<fmt>' path
            # here but never used or returned it; that dead code was removed.
        return new_pages

    def get_images(self):
        """Return the list of page images collected by convert()."""
        return self._images

    def convert(self, stream, opts, file_ext, log, accelerators):
        """Build an OPF (manifest, spine, TOC) for the comic and return its
        absolute path."""
        from ebook_converter.ebooks.metadata import MetaInformation
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.metadata.toc import TOC
        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        for i, x in enumerate(comics_):
            title, fname = x
            # Single comic goes into cwd; collections get one dir per comic.
            cdir = 'comic_%d' % (i + 1) if len(comics_) > 1 else '.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            if self.for_viewer:
                comics.append(
                    (title, pages, [self.create_viewer_wrapper(pages)]))
            else:
                wrappers = self.create_wrappers(pages)
                comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s' % stream.name)

        mi = MetaInformation(
            os.path.basename(stream.name).rpartition('.')[0], ['Unknown'])
        opf = OPFCreator(os.getcwd(), mi)
        entries = []

        def href(x):
            # Single comic: flat layout; collection: keep the comic subdir.
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        cover_href = None
        for comic in comics:
            pages, wrappers = comic[1:]
            page_entries = [(x, None) for x in map(href, pages)]
            entries += [(w, None) for w in map(href, wrappers)] + page_entries
            if cover_href is None and page_entries:
                cover_href = page_entries[0][0]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(map(href, comic[2]))
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        if self.for_viewer and cover_href:
            opf.guide.set_cover(cover_href)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            for i, x in enumerate(wrappers):
                toc.add_item(href(x), None, 'Page %d' % (i + 1), play_order=i)
        else:
            po = 0
            for comic in comics:
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]), None, comic[0],
                                    play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    for i, x in enumerate(wrappers):
                        stoc.add_item(href(x), None, 'Page %d' % (i + 1),
                                      play_order=po)
                        po += 1
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')

    def create_wrappers(self, pages):
        """Write one XHTML wrapper file per page image; return their paths."""
        wrappers = []
        WRAPPER = textwrap.dedent('''\
            <html xmlns="%s">
                <head>
                    <meta charset="utf-8"/>
                    <title>Page #%d</title>
                    <style type="text/css">
                        @page { margin:0pt; padding: 0pt}
                        body { margin: 0pt; padding: 0pt}
                        div { text-align: center }
                    </style>
                </head>
                <body>
                    <div>
                        <img src="%s" alt="comic page #%d" />
                    </div>
                </body>
            </html>
            ''')
        base = os.path.dirname(pages[0])
        for i, page in enumerate(pages):
            wrapper = WRAPPER % (const.XHTML_NS, i + 1,
                                 os.path.basename(page), i + 1)
            page = os.path.join(base, 'page_%d.xhtml' % (i + 1))
            with open(page, 'wb') as f:
                f.write(wrapper.encode('utf-8'))
            wrappers.append(page)
        return wrappers

    def create_viewer_wrapper(self, pages):
        """Write a single XHTML file embedding all pages (viewer mode)."""
        def page(src):
            return '<img src="{}"></img>'.format(os.path.basename(src))

        # BUG FIX: the output directory must be derived from the first page
        # *before* `pages` is rebound to the joined markup string below;
        # previously dirname() was applied to the markup's first character.
        base = os.path.dirname(pages[0])
        pages = '\n'.join(map(page, pages))
        wrapper = '''
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <style type="text/css">
                html, body, img { height: 100vh; display: block; margin: 0;
                                  padding: 0; border-width: 0; }
                img {
                    width: 100%%; height: 100%%;
                    object-fit: contain;
                    margin-left: auto; margin-right: auto;
                    max-width: 100vw; max-height: 100vh;
                    top: 50vh; transform: translateY(-50%%);
                    position: relative;
                    page-break-after: always;
                }
                </style>
            </head>
            <body>
            %s
            </body>
        </html>
        ''' % (const.XHTML_NS, pages)
        path = os.path.join(base, 'wrapper.xhtml')
        with open(path, 'wb') as f:
            f.write(wrapper.encode('utf-8'))
        return path
class RTFInput(InputFormatPlugin):
    """Input plugin: convert RTF files to HTML via rtf2xml + XSLT."""

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = 'Convert RTF files to HTML'
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    options = {
        OptionRecommendation(name='ignore_wmf', recommended_value=False,
                             help='Ignore WMF images instead of '
                                  'replacing them with a placeholder '
                                  'image.')
    }

    def generate_xml(self, stream):
        """Parse the RTF in *stream* with rtf2xml and return the XML bytes."""
        from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = u'dataxml.xml'
        run_lev, debug_dir, indent_out = 1, None, 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            try:
                os.mkdir(u'rtfdebug')
                debug_dir = u'rtfdebug'
                run_lev = 4
                indent_out = 1
                self.log('Running RTFParser in debug mode')
            except Exception:
                self.log.warn('Impossible to run RTFParser in debug mode')
        parser = ParseRtf(
            in_file=stream,
            out_file=ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol=1,
            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf=1,
            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings=1,
            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps=1,
            # Indent resulting XML.
            # Default is 0 (no indent).
            indent=indent_out,
            # Form lists from RTF. Default is 1.
            form_lists=1,
            # Convert headings to sections. Default is 0.
            headings_to_sections=1,
            # Group paragraphs with the same style name. Default is 1.
            group_styles=1,
            # Group borders. Default is 1.
            group_borders=1,
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs=1,
            # Debug
            deb_dir=debug_dir,
            # Default encoding
            default_encoding=getattr(self.opts, 'input_encoding',
                                     'cp1252') or 'cp1252',
            # Run level
            run_level=run_lev,
        )
        parser.parse_rtf()
        with open(ofile, 'rb') as f:
            return f.read()

    def extract_images(self, picts):
        """Decode hex-encoded \\pict blobs from *picts* into image files.

        Returns a mapping of 1-based picture number -> converted file name.
        """
        from ebook_converter.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as f:
            raw = f.read()
        picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
        hex_pat = re.compile(br'[^a-fA-F0-9]')
        encs = [hex_pat.sub(b'', pict) for pict in picts]

        count = 0
        imap = {}
        for enc in encs:
            # unhexlify needs an even number of hex digits.
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = unhexlify(enc)
            fmt = what(None, data)
            if fmt is None:
                # Unrecognized data: assume Windows Metafile.
                fmt = 'wmf'
            count += 1
            name = u'%04d.%s' % (count, fmt)
            with open(name, 'wb') as f:
                f.write(data)
            imap[count] = name
            # with open(name+'.hex', 'wb') as f:
            #     f.write(enc)
        return self.convert_images(imap)

    def convert_images(self, imap):
        """Convert every extracted image in place; best-effort per image."""
        self.default_img = None
        for count, val in imap.items():
            try:
                imap[count] = self.convert_image(val)
            except Exception:
                self.log.exception('Failed to convert', val)
        return imap

    def convert_image(self, name):
        """Return a displayable file name for *name*, rasterizing WMF."""
        if not name.endswith('.wmf'):
            return name
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r' % name)
        return self.replace_wmf(name)

    def replace_wmf(self, name):
        """Replace an unconvertible WMF with a placeholder (or drop it)."""
        if self.opts.ignore_wmf:
            os.remove(name)
            # Marker img src; postprocess_book() strips these elements.
            return '__REMOVE_ME__'
        from ebook_converter.ebooks.covers import message_image
        if self.default_img is None:
            self.default_img = message_image('Conversion of WMF images is not '
                                             'supported. Use Microsoft Word '
                                             'or OpenOffice to save this RTF '
                                             'file as HTML and convert that '
                                             'in calibre.')
        name = name.replace('.wmf', '.jpg')
        with open(name, 'wb') as f:
            f.write(self.default_img)
        return name

    def rasterize_wmf(self, name):
        """Unwrap the WMF in *name* to PNG and return the new file name."""
        from ebook_converter.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as f:
            data = f.read()
        data = wmf_unwrap(data)
        name = name.replace('.wmf', '.png')
        with open(name, 'wb') as f:
            f.write(data)
        return name

    def write_inline_css(self, ic, border_styles):
        """Append the generated span/border CSS classes to styles.css."""
        font_size_classes = [
            'span.fs%d { font-size: %spt }' % (i, x)
            for i, x in enumerate(ic.font_sizes)
        ]
        color_classes = [
            'span.col%d { color: %s }' % (i, x)
            for i, x in enumerate(ic.colors) if x != 'false'
        ]
        css = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }

        span.italics { font-style: italic }
        span.bold { font-weight: bold }

        span.small-caps { font-variant: small-caps }

        span.underlined { text-decoration: underline }

        span.strike-through { text-decoration: line-through }

        ''')
        css += '\n' + '\n'.join(font_size_classes)
        css += '\n' + '\n'.join(color_classes)

        for cls, val in border_styles.items():
            css += '\n\n.%s {\n%s\n}' % (cls, val)

        # Append: earlier pipeline stages may already have written styles.
        with open(u'styles.css', 'ab') as f:
            f.write(css.encode('utf-8'))

    def convert_borders(self, doc):
        """Translate per-cell border attributes into CSS classes.

        Mutates *doc* (sets class attributes on cells) and returns a
        mapping of class name -> CSS declarations.
        """
        border_styles = []
        style_map = {}
        for elem in doc.xpath(r'//*[local-name()="cell"]'):
            style = [
                'border-style: hidden', 'border-width: 1px',
                'border-color: black'
            ]
            for x in ('bottom', 'top', 'left', 'right'):
                bs = elem.get('border-cell-%s-style' % x, None)
                if bs:
                    cbs = border_style_map.get(bs, 'solid')
                    style.append('border-%s-style: %s' % (x, cbs))
                bw = elem.get('border-cell-%s-line-width' % x, None)
                if bw:
                    style.append('border-%s-width: %spt' % (x, bw))
                bc = elem.get('border-cell-%s-color' % x, None)
                if bc:
                    style.append('border-%s-color: %s' % (x, bc))
            style = ';\n'.join(style)
            if style not in border_styles:
                border_styles.append(style)
            idx = border_styles.index(style)
            cls = 'border_style%d' % idx
            style_map[cls] = style
            elem.set('class', cls)
        return style_map

    def convert(self, stream, options, file_ext, log, accelerators):
        """Drive the full RTF -> XML -> XHTML conversion; return OPF path."""
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.rtf2xml.ParseRtf import \
            RtfInvalidCodeException
        from ebook_converter.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError('This RTF file has a feature calibre does not '
                             'support. Convert it to HTML first and then try '
                             'it.\n%s' % e)

        # BUG FIX: imap must exist even when no pict dir was produced,
        # otherwise the pict loop below raises NameError.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = etree.fromstring(xml)
        border_styles = self.convert_borders(doc)
        for pict in doc.xpath(
                '//rtf:pict[@num]',
                namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        with open(pkg_resources.resource_filename(
                'ebook_converter', 'data/rtf.xsl')) as fobj:
            styledoc = etree.fromstring(fobj.read())
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            # Replace newlines inserted by the 'empty_paragraphs' option in
            # rtf2xml with html blank lines
            # res = re.sub('\s*<body>', '<body>', res)
            # res = re.sub('(?<=\n)\n{2}',
            #              u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        opf.render(open(u'metadata.opf', 'wb'))
        return os.path.abspath(u'metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        """Strip placeholder <img src="__REMOVE_ME__"> elements, preserving
        any tail text."""
        for item in oeb.spine:
            for img in item.data.xpath('//*[local-name()="img" and '
                                       '@src="__REMOVE_ME__"]'):
                p = img.getparent()
                idx = p.index(img)
                p.remove(img)
                if img.tail:
                    if idx == 0:
                        p.text = (p.text or '') + img.tail
                    else:
                        p[idx - 1].tail = (p[idx - 1].tail or '') + img.tail
class PDFOutput(OutputFormatPlugin):
    """Output plugin: render the book to PDF (via Qt WebEngine)."""

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    ui_data = {
        'paper_sizes': PAPER_SIZES,
        'units': UNITS,
        'font_types': ('serif', 'sans', 'mono')
    }

    options = {
        OptionRecommendation(
            name='use_profile_size', recommended_value=False,
            help='Instead of using the paper size specified in the PDF '
                 'Output options, use a paper size corresponding to the '
                 'current output profile. Useful if you want to generate a '
                 'PDF for viewing on a specific device.'),
        OptionRecommendation(
            name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help='The unit of measure for page sizes. Default is inch. '
                 'Choices are {} '
                 'Note: This does not override the unit for margins!'.format(
                     ', '.join(UNITS))),
        OptionRecommendation(
            name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help='The size of the paper. This size will be overridden when a '
                 'non default output profile is used. Default is letter. '
                 'Choices are {}'.format(', '.join(PAPER_SIZES))),
        OptionRecommendation(
            name='custom_size', recommended_value=None,
            help='Custom size of the document. Use the form widthxheight '
                 'e.g. `123x321` to specify the width and height. '
                 'This overrides any specified paper-size.'),
        OptionRecommendation(
            name='preserve_cover_aspect_ratio', recommended_value=False,
            help='Preserve the aspect ratio of the cover, instead'
                 ' of stretching it to fill the full first page of the'
                 ' generated pdf.'),
        OptionRecommendation(
            name='pdf_serif_family', recommended_value='Times',
            help='The font family used to render serif fonts. Will work '
                 'only if the font is available system-wide.'),
        OptionRecommendation(
            name='pdf_sans_family', recommended_value='Helvetica',
            help='The font family used to render sans-serif fonts. Will '
                 'work only if the font is available system-wide.'),
        OptionRecommendation(
            name='pdf_mono_family', recommended_value='Courier',
            help='The font family used to render monospace fonts. Will '
                 'work only if the font is available system-wide.'),
        OptionRecommendation(
            name='pdf_standard_font', choices=ui_data['font_types'],
            recommended_value='serif',
            help='The font family used to render monospace fonts'),
        OptionRecommendation(name='pdf_default_font_size',
                             recommended_value=20,
                             help='The default font size'),
        OptionRecommendation(name='pdf_mono_font_size',
                             recommended_value=16,
                             help='The default font size for monospaced '
                                  'text'),
        OptionRecommendation(
            name='pdf_hyphenate', recommended_value=False,
            help='Break long words at the end of lines. This can give the '
                 'text at the right margin a more even appearance.'),
        OptionRecommendation(
            name='pdf_mark_links', recommended_value=False,
            help='Surround all links with a red box, useful for debugging.'),
        OptionRecommendation(
            name='pdf_page_numbers', recommended_value=False,
            help='Add page numbers to the bottom of every page in the '
                 'generated PDF file. If you '
                 'specify a footer template, it will take precedence '
                 'over this option.'),
        OptionRecommendation(
            name='pdf_footer_template', recommended_value=None,
            help='An HTML template used to generate %s on every page.'
                 ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ '
                 'will be replaced by their current values.' % 'footers'),
        OptionRecommendation(
            name='pdf_header_template', recommended_value=None,
            help='An HTML template used to generate %s on every page.'
                 ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ '
                 'will be replaced by their current values.' % 'headers'),
        OptionRecommendation(
            name='pdf_add_toc', recommended_value=False,
            help='Add a Table of Contents at the end of the PDF that lists '
                 'page numbers. '
                 'Useful if you want to print out the PDF. If this PDF is '
                 'intended for electronic use, use the PDF Outline instead.'),
        OptionRecommendation(name='toc_title', recommended_value=None,
                             help='Title for generated table of contents.'),
        OptionRecommendation(
            name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the left page margin, in pts. Default is 72pt.'
                 ' Overrides the common left page margin setting.'),
        OptionRecommendation(
            name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the top page margin, in pts. Default is 72pt.'
                 ' Overrides the common top page margin setting, unless set '
                 'to zero.'),
        OptionRecommendation(
            name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the right page margin, in pts. Default is '
                 '72pt. Overrides the common right page margin setting, '
                 'unless set to zero.'),
        OptionRecommendation(
            name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help='The size of the bottom page margin, in pts. Default is '
                 '72pt. Overrides the common bottom page margin setting, '
                 'unless set to zero.'),
        OptionRecommendation(
            name='pdf_use_document_margins', recommended_value=False,
            help='Use the page margins specified in the input document via '
                 '@page CSS rules.'
                 ' This will cause the margins specified in the conversion '
                 'settings to be ignored.'
                 ' If the document does not specify page margins, the '
                 'conversion settings will be used as a fallback.'),
        OptionRecommendation(
            name='pdf_page_number_map', recommended_value=None,
            help='Adjust page numbers, as needed. Syntax is a JavaScript '
                 'expression for the page number.'
                 ' For example, "if (n < 3) 0; else n - 3;", where n is '
                 'current page number.'),
        OptionRecommendation(
            name='uncompressed_pdf', recommended_value=False,
            help='Generate an uncompressed PDF, useful for debugging.'),
        OptionRecommendation(
            name='pdf_odd_even_offset', recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help='Shift the text horizontally by the specified offset '
                 '(in pts).'
                 ' On odd numbered pages, it is shifted to the right and on '
                 'even'
                 ' numbered pages to the left. Use negative numbers for the '
                 'opposite'
                 ' effect. Note that this setting is ignored on pages where '
                 'the margins'
                 ' are smaller than the specified offset. Shifting is done '
                 'by setting'
                 ' the PDF CropBox, not all software respects the CropBox.')
    }

    def specialize_options(self, log, opts, input_fmt):
        """Register the custom URL scheme and initialize Qt early.

        specialize_options is called early enough in the pipeline that
        hopefully no Qt application has been constructed as yet.
        """
        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
        from ebook_converter.gui2 import must_use_qt
        from ebook_converter.constants_old import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document
            # margins
            opts.margin_left = opts.margin_right = -1
            opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Entry point: dispatch to image- or text-based PDF generation."""
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path
        from ebook_converter.ebooks.oeb.base import OPF, OPF2_NS
        from lxml import etree
        from io import BytesIO
        package = etree.Element(OPF('package'),
                                attrib={
                                    'version': '2.0',
                                    'unique-identifier': 'dummy'
                                },
                                nsmap={None: OPF2_NS})
        # Import under a distinct name: previously this re-import shadowed
        # the element-maker OPF used above, which was confusing.
        from ebook_converter.ebooks.metadata.opf2 import OPF as OPFReader
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPFReader(
            BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        """Render a PDF directly from a list of page images."""
        from ebook_converter.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata,
                self.report_progress)

    def get_cover_data(self):
        """Cache the manifest item data for the declared cover, if any."""
        oeb = self.oeb
        if (oeb.metadata.cover
                and str(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = str(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            self.cover_data = item.data

    def process_fonts(self):
        '''
        Make sure all fonts are embeddable
        '''
        from ebook_converter.ebooks.oeb.base import urlnormalize
        from ebook_converter.utils.fonts.utils import \
            remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for i, rule in enumerate(item.data.cssRules):
                if rule.type == rule.FONT_FACE_RULE:
                    try:
                        s = rule.style
                        src = s.getProperty('src').propertyValue[0].uri
                    except Exception:
                        # BUG FIX: was a bare except, which also swallowed
                        # KeyboardInterrupt/SystemExit.
                        continue
                    path = item.abshref(src)
                    ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                    if ff is None:
                        continue

                    raw = nraw = ff.data
                    if path not in processed:
                        processed.add(path)
                        try:
                            nraw = remove_embed_restriction(raw)
                        except Exception:
                            # BUG FIX: was a bare except (see above).
                            continue
                    if nraw != raw:
                        ff.data = nraw
                        self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        """Render a text-based book: re-serialize to OEB, then to PDF."""
        import json
        from ebook_converter.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            # Re-attach the @page margins recorded earlier in the pipeline.
            for href, margins in self.stored_page_margins.items():
                item = oeb_book.manifest.hrefs.get(href)
                if item is not None:
                    root = item.data
                    if hasattr(root, 'xpath') and margins:
                        root.set('data-calibre-pdf-output-page-margins',
                                 json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from ebook_converter.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin,
                               self.opts, self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(opfpath,
                    self.opts,
                    metadata=self.metadata,
                    output_path=self.output_path,
                    log=self.log,
                    cover_data=self.cover_data,
                    report_progress=self.report_progress)
class TXTInput(InputFormatPlugin):

    """Convert plain text and its markup dialects (Markdown, Textile) to
    HTML, then hand off to the HTML input plugin to build the OEB book."""

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': 'Try to auto detect paragraph type',
            'block': 'Treat a blank line as a paragraph break',
            'single': 'Assume every line is a paragraph',
            'print': 'Assume every line starting with 2+ spaces or a tab '
                     'starts a paragraph',
            'unformatted': 'Most lines have hard line breaks, few/no blank '
                           'lines or indents',
            'off': 'Don\'t modify the paragraph structure',
        },
        'formatting_types': {
            'auto': 'Automatically decide which formatting processor to use',
            'plain': 'No formatting',
            'heuristic': 'Use heuristics to determine chapter headings, '
                         'italics, etc.',
            'textile': 'Use the TexTile markup language',
            'markdown': 'Use the Markdown markup language'
        },
    }

    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help='Formatting used within the document.\n'
                 '* auto: {auto}\n'
                 '* plain: {plain}\n'
                 '* heuristic: {heuristic}\n'
                 '* textile: {textile}\n'
                 '* markdown: {markdown}\n'
                 'To learn more about markdown see '
                 '{url}'.format(
                     url='https://daringfireball.net/projects/'
                         'markdown/',
                     **ui_data['formatting_types'])),
        OptionRecommendation(
            name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help='Paragraph structure to assume. The value of "off" is useful '
                 'for formatted documents such as Markdown or Textile. '
                 'Choices are:\n'
                 '* auto: {auto}\n'
                 '* block: {block}\n'
                 '* single: {single}\n'
                 '* print: {print}\n'
                 '* unformatted: {unformatted}\n'
                 '* off: {off}'.format(**ui_data['paragraph_types'])),
        OptionRecommendation(
            name='preserve_spaces', recommended_value=False,
            help='Normally extra spaces are condensed into a single space. '
                 'With this option all spaces will be displayed.'),
        OptionRecommendation(
            name='txt_in_remove_indents', recommended_value=False,
            help='Normally extra space at the beginning of lines is retained. '
                 'With this option they will be removed.'),
        OptionRecommendation(
            name="markdown_extensions",
            recommended_value='footnotes, tables, toc',
            help='Enable extensions to markdown syntax. Extensions are '
                 'formatting that is not part of the standard markdown '
                 'format. The extensions enabled by default: %default.\nTo '
                 'learn more about markdown extensions, see {}\nThis should '
                 'be a comma separated list of extensions to enable:'
                 '\n'.format('https://python-markdown.github.io/extensions/') +
                 '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k])
                           for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, fname, data):
        """Write data into the output directory under fname, appending a
        numeric suffix instead of overwriting an existing file.

        Returns the path of the file actually written.
        """
        name, ext = os.path.splitext(fname)
        candidate = os.path.join(self.output_dir, fname)
        c = 0
        while os.path.exists(candidate):
            c += 1
            candidate = os.path.join(self.output_dir,
                                     '{}-{}{}'.format(name, c, ext))
        ans = candidate
        with open(ans, 'wb') as f:
            f.write(data)
        return f.name

    def fix_resources(self, html, base_dir):
        """Copy local images referenced by the generated HTML into the
        output directory and rewrite their src attributes to point at the
        copies. Remote (http/ftp/file) and absolute references are left
        alone."""
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            prefix = src.split(':', 1)[0].lower()
            if prefix not in ('file', 'http', 'https', 'ftp') and \
                    not os.path.isabs(src):
                src = os.path.join(base_dir, src)
                if os.access(src, os.R_OK):
                    with open(src, 'rb') as f:
                        data = f.read()
                    # Renamed from `f` to avoid shadowing the file handle
                    # above; shift_file returns the written path.
                    moved = self.shift_file(os.path.basename(src), data)
                    changed = True
                    img.set('src', os.path.basename(moved))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text stream to an OEB book.

        Detects encoding, paragraph structure and markup dialect (unless
        forced by options/file extension), converts to HTML and runs the
        result through the HTML input plugin.
        """
        from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from ebook_converter.ebooks.chardet import detect
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.txt.processor import (
            convert_basic, convert_markdown_with_metadata,
            separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = os.getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            # Walk in sorted order: os.walk yields names in arbitrary,
            # platform-dependent order, which previously made the
            # concatenation of multiple text files nondeterministic.
            for root, dnames, fnames in os.walk('.'):
                dnames.sort()
                for x in sorted(fnames):
                    x = os.path.join(root, x)
                    if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                        with open(x, 'rb') as tf:
                            txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {
                    'md': 'markdown'
                }.get(file_ext, file_ext)
                log.info(
                    'File extension indicates particular formatting. '
                    'Forcing formatting type to: %s', options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s', ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding[
                'confidence']
            if det_encoding and det_encoding.lower().replace(
                    '_', '-').strip() in ('gb2312', 'chinese',
                                          'csiso58gb231280', 'euc-cn',
                                          'euccn', 'eucgb2312-cn',
                                          'gb2312-1980', 'gb2312-80',
                                          'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug(
                'Detected input encoding as %s with a confidence of '
                '%s%%', ienc, confidence * 100)
        if not ienc:
            ienc = 'utf-8'
            log.debug(
                'No input encoding specified and could not auto detect '
                'using %s', ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using '
                          'block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s',
                          options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s',
                      options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options,
                                              log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(
                options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(
                        txt,
                        extensions=[
                            x.strip()
                            for x in options.markdown_extensions.split(',')
                            if x.strip()
                        ])
                except RuntimeError:
                    raise ValueError(
                        'This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                    )
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from ebook_converter.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html',
                                     log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from ebook_converter.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Replace placeholder 'Unknown' <title> texts in the spine with the
        title extracted from the input file's metadata."""
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == 'Unknown':
                        title.text = self.html_postprocess_title
class SNBOutput(OutputFormatPlugin):

    """Write the book out as an SNB (Shanda Bambook) container."""

    name = 'SNB Output'
    author = 'Li Fanxi'
    file_type = 'snb'
    commit_name = 'snb_output'

    options = {
        OptionRecommendation(
            name='snb_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
                 'The default is utf-8.'),
        OptionRecommendation(
            name='snb_max_line_length', recommended_value=0,
            level=OptionRecommendation.LOW,
            help='The maximum number of characters per line. This splits on '
                 'the first space before the specified value. If no space is '
                 'found the line will be broken at the space after and will '
                 'exceed the specified value. Also, there is a minimum of 25 '
                 'characters. Use 0 to disable line splitting.'),
        OptionRecommendation(
            name='snb_insert_empty_line', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Specify whether or not to insert an empty line between two '
                 'paragraphs.'),
        OptionRecommendation(
            name='snb_dont_indent_first_line', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Specify whether or not to insert two space characters to '
                 'indent the first line of each paragraph.'),
        OptionRecommendation(
            name='snb_hide_chapter_name', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Specify whether or not to hide the chapter title for each '
                 'chapter. Useful for image-only output (eg. comics).'),
        OptionRecommendation(
            name='snb_full_screen', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Resize all the images for full screen view. '),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from lxml import etree
        from ebook_converter.ebooks.snb.snbfile import SNBFile
        from ebook_converter.ebooks.snb.snbml import SNBMLizer, ProcessFileName
        # BUG FIX: these were previously imported inside the spine loop
        # below, so OEB_IMAGES was undefined in the manifest loop whenever
        # the spine was empty (NameError), and the import ran once per
        # spine item. Import once, up front.
        from ebook_converter.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES

        self.opts = opts
        from ebook_converter.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process Meta data
            meta = oeb_book.metadata
            if meta.title:
                title = str(meta.title[0])
            else:
                title = ''
            authors = [str(x) for x in meta.creator if x.role == 'aut']
            if meta.publisher:
                publishers = str(meta.publisher[0])
            else:
                publishers = ''
            if meta.language:
                lang = str(meta.language[0]).upper()
            else:
                lang = ''
            if meta.description:
                abstract = str(meta.description[0])
            else:
                abstract = ''

            # Process Cover
            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
            href = None
            if 'titlepage' not in g:
                if 'cover' in g:
                    href = g['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(
                headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            if href is not None:
                etree.SubElement(headTree,
                                 "cover").text = ProcessFileName(href)
            else:
                etree.SubElement(headTree, "cover")
            with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
                f.write(
                    etree.tostring(bookInfoTree, pretty_print=True,
                                   encoding='utf-8'))

            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            outputFiles = {}
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                         'Creating a default TOC')
                first = next(iter(oeb_book.spine))
                oeb_book.toc.add('Start page', first.href)
            else:
                first = next(iter(oeb_book.spine))
                if oeb_book.toc[0].href != first.href:
                    # The pages before the fist item in toc will be stored as
                    # "Cover Pages".
                    # oeb_book.toc does not support "insert", so we generate
                    # the tocInfoTree directly instead of modifying the toc
                    ch = etree.SubElement(tocBody, "chapter")
                    ch.set("src", ProcessFileName(first.href) + ".snbc")
                    ch.text = 'Cover pages'
                    outputFiles[first.href] = []
                    outputFiles[first.href].append(("", "Cover pages"))

            for tocitem in oeb_book.toc:
                if tocitem.href.find('#') != -1:
                    item = tocitem.href.split('#')
                    if len(item) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        if item[0] in outputFiles:
                            outputFiles[item[0]].append(
                                (item[1], tocitem.title))
                        else:
                            outputFiles[item[0]] = []
                            if "" not in outputFiles[item[0]]:
                                outputFiles[item[0]].append(
                                    ("", tocitem.title + " (Preface)"))
                                ch = etree.SubElement(tocBody, "chapter")
                                ch.set("src",
                                       ProcessFileName(item[0]) + ".snbc")
                                ch.text = tocitem.title + " (Preface)"
                            outputFiles[item[0]].append(
                                (item[1], tocitem.title))
                else:
                    if tocitem.href in outputFiles:
                        outputFiles[tocitem.href].append(("", tocitem.title))
                    else:
                        outputFiles[tocitem.href] = []
                        outputFiles[tocitem.href].append(("", tocitem.title))
                    ch = etree.SubElement(tocBody, "chapter")
                    ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                    ch.text = tocitem.title

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)

            with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
                f.write(
                    etree.tostring(tocInfoTree, pretty_print=True,
                                   encoding='utf-8'))

            # Output Files
            oldTree = None
            mergeLast = False
            lastName = None
            for item in s:
                if m.hrefs[item.href].media_type in OEB_DOCS:
                    if item.href not in outputFiles:
                        log.debug(
                            'File %s is unused in TOC. Continue in last '
                            'chapter' % item.href)
                        mergeLast = True
                    else:
                        if oldTree is not None and mergeLast:
                            log.debug('Output the modified chapter again: %s'
                                      % lastName)
                            with open(os.path.join(snbcDir, lastName),
                                      'wb') as f:
                                f.write(
                                    etree.tostring(oldTree, pretty_print=True,
                                                   encoding='utf-8'))
                            mergeLast = False

                    log.debug('Converting %s to snbc...' % item.href)
                    snbwriter = SNBMLizer(log)
                    snbcTrees = None
                    if not mergeLast:
                        snbcTrees = snbwriter.extract_content(
                            oeb_book, item, outputFiles[item.href], opts)
                        for subName in snbcTrees:
                            postfix = ''
                            if subName != '':
                                postfix = '_' + subName
                            lastName = ProcessFileName(item.href + postfix +
                                                       ".snbc")
                            oldTree = snbcTrees[subName]
                            with open(os.path.join(snbcDir, lastName),
                                      'wb') as f:
                                f.write(
                                    etree.tostring(oldTree, pretty_print=True,
                                                   encoding='utf-8'))
                    else:
                        log.debug('Merge %s with last TOC item...' %
                                  item.href)
                        snbwriter.merge_content(oldTree, oeb_book, item,
                                                [('', "Start")], opts)

            # Output the last one if needed
            log.debug('Output the last modified chapter again: %s' % lastName)
            if oldTree is not None and mergeLast:
                with open(os.path.join(snbcDir, lastName), 'wb') as f:
                    f.write(
                        etree.tostring(oldTree, pretty_print=True,
                                       encoding='utf-8'))
                mergeLast = False

            for item in m:
                if m.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = m.hrefs[item.href].data
                    # Convert & Resize image
                    self.HandleImage(
                        content,
                        os.path.join(snbiDir, ProcessFileName(item.href)))

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

    def HandleImage(self, imageData, imagePath):
        """Write imageData to imagePath, downscaling to the device screen
        size (preserving aspect ratio) when the image is larger."""
        from ebook_converter.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        if self.opts:
            if self.opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
        else:
            SCREEN_X = 540
            SCREEN_Y = 700
        # Handle big image only
        if x > SCREEN_X or y > SCREEN_Y:
            xScale = float(x) / SCREEN_X
            yScale = float(y) / SCREEN_Y
            scale = max(xScale, yScale)
            # TODO : intelligent image rotation
            #     img = img.rotate(90)
            #     x,y = y,x
            img = resize_image(img, x // scale, y // scale)
        with open(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
class EPUBOutput(OutputFormatPlugin):

    """Write the book out as an EPUB 2 or EPUB 3 container, applying
    various workarounds for quirky readers (ADE, WebKit, Sony)."""

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'
    commit_name = 'epub_output'
    ui_data = {'versions': ('2', '3')}

    options = {
        OptionRecommendation(name='extract_to',
            help='Extract the contents of the generated %s file to the '
                 'specified directory. The contents of the directory are '
                 'first deleted, so be careful.' % 'EPUB'),
        OptionRecommendation(name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Turn off splitting at page breaks. Normally, input '
                 'files are automatically split at every page break into '
                 'two files. This gives an output e-book that can be '
                 'parsed faster and with less resources. However, '
                 'splitting is slow and if your source file contains a '
                 'very large number of page breaks, you should turn off '
                 'splitting on page breaks.'),
        OptionRecommendation(name='flow_size', recommended_value=260,
            help='Split all HTML files larger than this size (in KB). '
                 'This is necessary as most EPUB readers cannot handle large '
                 'file sizes. The default of %defaultKB is the size required '
                 'for Adobe Digital Editions. Set to 0 to disable size based '
                 'splitting.'),
        OptionRecommendation(name='no_default_epub_cover',
            recommended_value=False,
            help='Normally, if the input file has no cover and you don\'t'
                 ' specify one, a default cover is generated with the title, '
                 'authors, etc. This option disables the generation of this '
                 'cover.'),
        OptionRecommendation(name='no_svg_cover', recommended_value=False,
            help='Do not use SVG for the book cover. Use this option if '
                 'your EPUB is going to be used on a device that does not '
                 'support SVG, like the iPhone or the JetBook Lite. '
                 'Without this option, such devices will display the cover '
                 'as a blank page.'),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help='When using an SVG cover, this option will cause the cover '
                 'to scale to cover the available screen area, but still '
                 'preserve its aspect ratio (ratio of width to height). That '
                 'means there may be white borders at the sides or top and '
                 'bottom of the image, but the image will never be distorted. '
                 'Without this option the image may be slightly distorted, '
                 'but there will be no borders.'),
        OptionRecommendation(name='epub_flatten', recommended_value=False,
            help='This option is needed only if you intend to use the EPUB'
                 ' with FBReaderJ. It will flatten the file system inside the'
                 ' EPUB, putting all files into the top level.'),
        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
            help='Insert an inline Table of Contents that will appear as '
                 'part of the main book content.'),
        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
            help='Put the inserted inline Table of Contents at the end of '
                 'the book instead of the start.'),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help='Title for any generated in-line table of contents.'),
        OptionRecommendation(name='epub_version', recommended_value='2',
            choices=ui_data['versions'],
            help='The version of the EPUB file to generate. EPUB 2 is the '
                 'most widely compatible, only use EPUB 3 if you know you '
                 'actually need it.'),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def workaround_webkit_quirks(self):  # {{{
        # WebKit renders an empty <pre> oddly; turn it into a <div>.
        for x in self.oeb.spine:
            root = x.data
            body = base.XPath('//h:body')(root)
            if body:
                body = body[0]

            if not hasattr(body, 'xpath'):
                continue

            for pre in base.XPath('//h:pre')(body):
                if not pre.text and len(pre) == 0:
                    pre.tag = 'div'
    # }}}

    def upshift_markup(self):  # {{{
        'Upgrade markup to comply with XHTML 1.1 where possible'
        for x in self.oeb.spine:
            root = x.data
            if (not root.get(base.tag('xml', 'lang'))) and \
                    (root.get('lang')):
                root.set(base.tag('xml', 'lang'), root.get('lang'))
            body = base.XPath('//h:body')(root)
            if body:
                body = body[0]

            if not hasattr(body, 'xpath'):
                continue

            # <u> is not valid XHTML 1.1
            for u in base.XPath('//h:u')(root):
                u.tag = 'span'

            # Drop duplicate id/name attributes, keeping the first occurrence
            seen_ids, seen_names = set(), set()
            for x in base.XPath('//*[@id or @name]')(root):
                eid, name = x.get('id', None), x.get('name', None)
                if eid:
                    if eid in seen_ids:
                        del x.attrib['id']
                    else:
                        seen_ids.add(eid)
                if name:
                    if name in seen_names:
                        del x.attrib['name']
                    else:
                        seen_names.add(name)
    # }}}

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Apply transforms/workarounds, serialize via the OEB output
        plugin and package the result into an EPUB container."""
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb, opts, replace_previous_inline_toc=True,
                     ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from ebook_converter.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                      max_flow_size=self.opts.flow_size*1024)
        split(self.oeb, self.opts)

        from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                          'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add('Start', first.href)

        identifiers = oeb.metadata['identifier']
        _uuid = None
        for x in identifiers:
            # BUG FIX: the scheme attribute may be absent, in which case
            # get() returns None and calling .lower() on it raised
            # AttributeError. Normalize to '' before lowering.
            scheme = x.get(base.tag('opf', 'scheme'), None) or ''
            if scheme.lower() == 'uuid' or str(x).startswith('urn:uuid:'):
                _uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if _uuid is None:
            self.log.warn('No UUID identifier found')
            _uuid = str(uuid.uuid4())
            oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)

        if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if str(x) == _uuid:
                    x.content = 'urn:uuid:' + _uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from ebook_converter.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from ebook_converter.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml',
                                      atom_xml)]
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x)
                               for x in os.listdir(tdir)
                               if x.endswith('.ncx')][0])
            if self.opts.epub_version == '3':
                self.upgrade_to_epub3(tdir, opf)
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)

            from ebook_converter.ebooks.epub import initialize_container
            with initialize_container(output_path, os.path.basename(opf),
                                      extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml',
                                  as_bytes(encryption))
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                                  metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from ebook_converter.utils.zipfile import ZipFile
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)

    def upgrade_to_epub3(self, tdir, opf):
        """Upgrade the serialized EPUB 2 directory tree in tdir to EPUB 3."""
        self.log.info('Upgrading to EPUB 3...')
        from ebook_converter.ebooks.epub import simple_container_xml
        from ebook_converter.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
        try:
            os.mkdir(os.path.join(tdir, 'META-INF'))
        except EnvironmentError:
            pass
        with open(os.path.join(tdir, 'META-INF', 'container.xml'),
                  'wb') as f:
            f.write(simple_container_xml(
                os.path.basename(opf)).encode('utf-8'))
        from ebook_converter.ebooks.oeb.polish.container import EpubContainer
        container = EpubContainer(tdir, self.log)
        from ebook_converter.ebooks.oeb.polish.upgrade import epub_2_to_3
        existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
        nav_href = getattr(self.opts, 'epub3_nav_href', None)
        previous_nav = (nav_href, existing_nav) if (existing_nav and
                                                    nav_href) else None
        epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
        fix_conversion_titlepage_links_in_nav(container)
        container.commit()
        # The temporary container.xml is only needed by EpubContainer above
        os.remove(f.name)
        try:
            os.rmdir(os.path.join(tdir, 'META-INF'))
        except EnvironmentError:
            pass

    def encrypt_fonts(self, uris, tdir, _uuid):  # {{{
        """Obfuscate the first 1024 bytes of each font with the Adobe
        XOR scheme keyed on the book UUID; returns encryption.xml text
        or None when no fonts were processed."""
        from ebook_converter.polyglot.binary import from_hex_bytes

        key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
        if len(key) < 16:
            raise ValueError('UUID identifier %r is invalid'% _uuid)
        key = bytearray(from_hex_bytes((key + key)[:32]))
        paths = []
        with CurrentDir(tdir):
            paths = [os.path.join(*x.split('/')) for x in uris]
            uris = dict(zip(uris, paths))
            fonts = []
            for uri in list(uris.keys()):
                path = uris[uri]
                if not os.path.exists(path):
                    uris.pop(uri)
                    continue
                self.log.debug('Encrypting font:', uri)
                with open(path, 'r+b') as f:
                    data = f.read(1024)
                    if len(data) >= 1024:
                        data = bytearray(data)
                        f.seek(0)
                        f.write(bytes(bytearray(data[i] ^ key[i%16]
                                                for i in range(1024))))
                    else:
                        self.log.warn('Font', path, 'is invalid, ignoring')
                if not isinstance(uri, str):
                    uri = uri.decode('utf-8')
                fonts.append('''
                <enc:EncryptedData>
                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                    <enc:CipherData>
                    <enc:CipherReference URI="%s"/>
                    </enc:CipherData>
                </enc:EncryptedData>
                '''%(uri.replace('"', '\\"')))
            if fonts:
                ans = '''<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                    xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                    xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                    '''
                ans += '\n'.join(fonts)
                ans += '\n</encryption>'
                return ans
    # }}}

    def condense_ncx(self, ncx_path):  # {{{
        # Strip insignificant whitespace from the NCX unless pretty printing
        # was requested.
        from lxml import etree
        if not self.opts.pretty_print:
            tree = etree.parse(ncx_path)
            for tag in tree.getroot().iter(tag=etree.Element):
                if tag.text:
                    tag.text = tag.text.strip()
                if tag.tail:
                    tag.tail = tag.tail.strip()
            compressed = etree.tostring(tree.getroot(), encoding='utf-8')
            with open(ncx_path, 'wb') as f:
                f.write(compressed)
    # }}}

    def workaround_ade_quirks(self):  # {{{
        """
        Perform various markup transforms to get the output to render
        correctly in the quirky ADE.
        """
        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                _base, _, frag = href.partition('#')
                frag = urllib.parse.unquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                    node.href = _base

        for x in self.oeb.spine:
            root = x.data
            body = base.XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for x in base.XPath('//h:img')(body):
                    src = x.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(x)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for x in base.XPath('//h:a[@name]')(body):
                    if not x.get('id', False):
                        x.set('id', x.get('name'))
                    # The delightful epubcheck has started complaining about
                    # <a> tags that have name attributes.
                    x.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't
                # handle them
                for br in base.XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = parse_utils.barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No preceding sibling (StopIteration) or an
                        # unparseable tag: treat the <br> as the first
                        # child of <body>. (Was a bare except:.)
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = base.tag('xhtml', 'p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = list(filter(None, map(lambda x: x.strip(),
                                                  style)))
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we
                    # replaced) then this <br> replacement should have a
                    # 1-line height. Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in base.XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in base.XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {
                        'image/svg+xml', 'application/svg+xml'}:
                    continue
                tag.getparent().remove(tag)

            for tag in base.XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in base.XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and
                        tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in base.XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
                                      './h:label|./h:fieldset|./h:legend')
            for tag in base.XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = base.tag('xhtml', 'div')

            for tag in base.XPath('//h:center')(root):
                tag.tag = base.tag('xhtml', 'div')
                tag.set('style', 'text-align:center')
            # ADE can't handle & in an img url
            for tag in base.XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = base.XPath('ancestor::h:table')
            for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = base.tag('xhtml', 'div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero
            # width spaces
            special_chars = re.compile('[\u200b\u00ad]')
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left
                # margins
                from css_parser.css import CSSRule
                for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.'+lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(
                            CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and
                            # gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers
                # that cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
    # }}}

    def workaround_sony_quirks(self):  # {{{
        '''
        Perform toc link transforms to alleviate slow loading.
        '''
        from ebook_converter.ebooks.oeb.polish.toc import item_at_top

        def frag_is_at_top(root, frag):
            # True when the element with this id/name renders at the top of
            # its file, so the fragment can be dropped from the TOC href.
            elem = base.XPath('//*[@id="%s" or @name="%s"]'%(frag,
                                                             frag))(root)
            if elem:
                elem = elem[0]
            else:
                return False
            return item_at_top(elem)

        def simplify_toc_entry(toc):
            if toc.href:
                href, frag = urllib.parse.urldefrag(toc.href)
                if frag:
                    for x in self.oeb.spine:
                        if x.href == href:
                            if frag_is_at_top(x.data, frag):
                                self.log.debug(
                                    'Removing anchor from TOC href:',
                                    href+'#'+frag)
                                toc.href = href
                            break
            for x in toc:
                simplify_toc_entry(x)

        if self.oeb.toc:
            simplify_toc_entry(self.oeb.toc)
    # }}}
class RecipeInput(InputFormatPlugin):
    """Input plugin that downloads periodical content described by a recipe.

    Accepts either a ``.recipe`` source file, a ``.downloaded_recipe`` zip
    produced by :meth:`save_download`, or the title/urn of a builtin recipe.
    ``convert`` returns the path to the OPF of the downloaded book.
    """

    name = 'Recipe Input'
    author = 'Kovid Goyal'
    description = 'Download periodical content from the internet'
    file_types = {'recipe', 'downloaded_recipe'}
    commit_name = 'recipe_input'

    # Force conversion-pipeline settings that recipes rely on.
    recommendations = {
        ('chapter', None, OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('input_encoding', None, OptionRecommendation.HIGH),
        ('input_profile', 'default', OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('insert_metadata', False, OptionRecommendation.HIGH),
    }

    options = {
        OptionRecommendation(
            name='test', recommended_value=False,
            help='Useful for recipe development. Forces max_articles_per_feed '
                 'to 2 and downloads at most 2 feeds. You can change the '
                 'number of feeds and articles by supplying optional '
                 'arguments. For example: --test 3 1 will download at most 3 '
                 'feeds and only 1 article per feed.'),
        OptionRecommendation(
            name='username', recommended_value=None,
            help='Username for sites that require a login to access content.'),
        OptionRecommendation(
            name='password', recommended_value=None,
            help='Password for sites that require a login to access content.'),
        OptionRecommendation(
            name='dont_download_recipe', recommended_value=False,
            help='Do not download latest version of builtin recipes from the '
                 'calibre server'),
        OptionRecommendation(
            name='lrf', recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
    }

    def convert(self, recipe_or_file, opts, file_ext, log, accelerators):
        """Compile/download the recipe and return the path to its OPF.

        :param recipe_or_file: path to a recipe file, a downloaded-recipe
            zip, or a name used to look up a builtin recipe
        :param opts: conversion options; recipe conversion_options are
            copied onto it
        :raises ValueError: when no usable recipe can be found/compiled
        :raises RecipeDisabled: when the recipe declares itself disabled
        """
        from ebook_converter.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            # A previously downloaded recipe: unpack it into the cwd and
            # re-compile the stored recipe source without re-downloading.
            from ebook_converter.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            zf.extractall()
            zf.close()
            with open('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                # Recipe identified by urn of the form 'custom:<id>' or
                # 'builtin:<id>' via the environment.
                from ebook_converter.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(
                        urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                # A readable local recipe file.
                with open(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                # Treat the argument as the title of a builtin recipe.
                from ebook_converter.web.feeds.recipes.collection import (
                    get_builtin_recipe_by_title, get_builtin_recipe_titles)
                title = getattr(opts, 'original_recipe_input_arg',
                                recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    # Fall back to the raw argument without basename
                    # stripping (titles may contain path separators).
                    title = getattr(opts, 'original_recipe_input_arg',
                                    recipe_or_file)
                    title = title.rpartition('.')[0]
                raw = get_builtin_recipe_by_title(
                    title, log=log,
                    download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        # requires_version is a tuple of ints, so it must be
                        # stringified before joining (a plain join raised
                        # TypeError, silently forcing the builtin fallback).
                        log.warn(
                            'Downloaded recipe needs calibre version at '
                            'least: %s' % (
                                '.'.join(map(str, recipe.requires_version))))
                        builtin = True
                except Exception:
                    # Narrowed from a bare except; the fallback behavior
                    # (use the bundled recipe) is unchanged.
                    log.exception('Failed to compile downloaded recipe. '
                                  'Falling back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(
                        title, log=log, download_recipe=False)
                    if raw is None:
                        raise ValueError(
                            'Failed to find builtin recipe: ' + title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')
            if recipe is None:
                raise ValueError(
                    '%r is not a valid recipe file or builtin recipe'
                    % recipe_or_file)
            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro
        # Recipes may override arbitrary conversion options.
        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)
        # The download leaves an OPF in the cwd; search top level first,
        # then recursively.
        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)
        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

    def postprocess_book(self, oeb, opts, log):
        """Give the recipe a chance to post-process the generated book."""
        if self.recipe_object is not None:
            self.recipe_object.internal_postprocess_book(oeb, opts, log)
            self.recipe_object.postprocess_book(oeb, opts, log)

    def specialize(self, oeb, opts, log, output_fmt):
        """Strip inline navigation bars when the user disabled them."""
        if opts.no_inline_navbars:
            from ebook_converter.ebooks.oeb.base import XPath
            for item in oeb.spine:
                for div in XPath(
                        '//h:div[contains(@class, "calibre_navbar")]')(
                            item.data):
                    div.getparent().remove(div)

    def save_download(self, zf):
        """Store the recipe source inside *zf* for later re-conversion."""
        raw = self.recipe_source
        if isinstance(raw, str):
            raw = raw.encode('utf-8')
        zf.writestr('download.recipe', raw)
class HTMLOutput(OutputFormatPlugin):
    """Output plugin producing a ZIP of templated HTML pages plus a TOC.

    Renders each spine item through a Templite template into a directory of
    HTML files, writes an index page and CSS, zips the result, and can
    optionally extract the zip to a user directory.
    """

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'
    commit_name = 'html_output'

    options = {
        OptionRecommendation(
            name='template_css',
            help='CSS file used for the output instead of the default file'),
        OptionRecommendation(
            name='template_html_index',
            help='Template used for generation of the HTML index file instead of the default file'),
        OptionRecommendation(
            name='template_html',
            help='Template used for the generation of the HTML contents of the book instead of the default file'),
        OptionRecommendation(
            name='extract_to',
            help='Extract the contents of the generated ZIP file to the '
                 'specified directory. WARNING: The contents of the directory '
                 'will be deleted.'),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def generate_toc(self, oeb_book, ref_url, output_dir):
        """Return the table of contents as an lxml ``<div>`` of nested lists.

        :param ref_url: file the hrefs should be made relative to
        """
        with CurrentDir(output_dir):
            def build_node(current_node, parent=None):
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, 'ul')
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(os.path.abspath(unquote(node.href)),
                                   os.path.dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        # Collapse runs of whitespace in TOC titles.
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent

            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        """Return the table of contents serialized as an HTML string."""
        from lxml import etree

        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root, pretty_print=True, encoding='unicode',
                              xml_declaration=False)

    @staticmethod
    def _load_template(user_path, resource_name):
        """Return template text from *user_path*, or the bundled default.

        Always returns ``str``. Previously the bundled defaults were decoded
        on read and then decoded a second time later, which raised
        AttributeError on str; this helper decodes exactly once.
        """
        if user_path is not None:
            with open(user_path, 'rb') as f:
                return f.read().decode('utf-8')
        path = pkg_resources.resource_filename('ebook_converter',
                                               resource_name)
        with open(path, 'rb') as f:
            return f.read().decode('utf-8')

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render *oeb_book* to templated HTML and zip it at *output_path*."""
        from lxml import etree
        from ebook_converter.utils import zipfile
        from templite import Templite
        from ebook_converter.polyglot.urllib import unquote
        from ebook_converter.ebooks.html.meta import EasyMeta

        # Read template files (user-supplied or bundled defaults); each is
        # decoded exactly once by _load_template.
        template_html_index_data = self._load_template(
            opts.template_html_index, 'data/html_export_default_index.tmpl')
        template_html_data = self._load_template(
            opts.template_html, 'data/html_export_default.tmpl')
        template_css_data = self._load_template(
            opts.template_css, 'data/html_export_default.css')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(
            tempdir,
            os.path.basename(re.sub(r'\.zip', '', output_path) + '.html'))
        output_dir = re.sub(r'\.html', '', output_file) + '_files'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        # Write the index page.
        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file,
                                              output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir + os.sep + nextLink,
                               os.path.dirname(output_file))
            cssLink = relpath(os.path.abspath(css_path),
                              os.path.dirname(output_file))
            tocUrl = relpath(output_file, os.path.dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                toc=html_toc, meta=meta, nextLink=nextLink,
                                tocUrl=tocUrl, cssLink=cssLink,
                                firstContentPageLink=nextLink)
            if isinstance(t, str):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            # Write out non-spine resources (images, fonts, ...); spine
            # items are only touched (their content is rendered below).
            for item in oeb_book.manifest:
                path = os.path.abspath(unquote(item.href))
                item_dir = os.path.dirname(path)
                if not os.path.exists(item_dir):
                    os.makedirs(item_dir)
                if item.spine_position is not None:
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(item.bytes_representation)
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = os.path.abspath(unquote(item.href))
                item_dir = os.path.dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath(
                    '//h:head',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head, pretty_print=True,
                                              encoding='unicode')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(
                    re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                    head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>',
                                      head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath(
                    '//h:body',
                    namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body, pretty_print=True,
                                               encoding='unicode')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                       r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position + 1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position + 1].href
                    nextLink = relpath(os.path.abspath(nextLink), item_dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position - 1].href
                    prevLink = relpath(os.path.abspath(prevLink), item_dir)
                else:
                    prevLink = None

                cssLink = relpath(os.path.abspath(css_path), item_dir)
                tocUrl = relpath(output_file, item_dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template; toc is lazy so it is only built if the
                # template actually uses it
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path,
                                                     output_dir)
                t = templite.render(ebookContent=ebook_content,
                                    prevLink=prevLink, nextLink=nextLink,
                                    has_toc=bool(oeb_book.toc.count()),
                                    toc=toc, tocUrl=tocUrl,
                                    head_content=head_content, meta=meta,
                                    cssLink=cssLink,
                                    firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t.encode('utf-8'))
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        zfile.add_dir(output_dir, os.path.basename(output_dir))
        zfile.write(output_file, os.path.basename(output_file),
                    zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)

        zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
class PDFInput(InputFormatPlugin):
    """Input plugin that turns a PDF into HTML via pdftohtml."""

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(
            name='no_images', recommended_value=False,
            help='Do not extract images from the document'),
        OptionRecommendation(
            name='unwrap_factor', recommended_value=0.45,
            help='Scale used to determine the length at which a line should '
                 'be unwrapped. Valid values are a decimal between 0 and 1. The '
                 'default is 0.45, just below the median line length.'),
        OptionRecommendation(
            name='new_pdf_engine', recommended_value=False,
            help='Use the new PDF conversion engine. Currently not operational.'),
    }

    def convert_new(self, stream, accelerators):
        """Run the experimental reflow engine and return the OPF path."""
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml
        from ebook_converter.utils.cleantext import clean_ascii_chars
        from ebook_converter.ebooks.pdf.reflow import PDFDocument

        workdir = os.getcwd()
        pdftohtml(workdir, stream.name, self.opts.no_images, as_xml=True)
        with open('index.xml', 'rb') as src:
            markup = clean_ascii_chars(src.read())
        # PDFDocument writes metadata.opf into the cwd as a side effect.
        PDFDocument(markup, self.opts, self.log)
        return os.path.join(workdir, 'metadata.opf')

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the PDF *stream* to HTML + OPF; return the OPF path."""
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)

        workdir = os.getcwd()
        pdftohtml(workdir, stream.name, options.no_images)

        from ebook_converter.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        book_meta = get_metadata(stream, 'pdf')
        opf = OPFCreator(workdir, book_meta)

        # Everything pdftohtml produced besides index.html (images etc.)
        # goes into the manifest alongside it.
        extras = os.listdir(workdir)
        extras.remove('index.html')
        manifest = [('index.html', None)] + [(name, None) for name in extras]
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])

        log.debug('Rendering manifest...')
        with open('metadata.opf', 'wb') as opffile:
            opf.render(opffile)

        # If a toc.ncx was produced, patch its id into the <spine> tag.
        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
            if ncxid:
                with open('metadata.opf', 'r+b') as f:
                    patched = f.read().replace(
                        b'<spine',
                        b'<spine toc="%s"' % polyglot.as_bytes(ncxid))
                    f.seek(0)
                    f.write(patched)
        return os.path.join(workdir, 'metadata.opf')
class HTMLZOutput(OutputFormatPlugin):
    """Output plugin producing an HTMLZ archive: a zip with a single HTML
    file, optional external CSS, images and an OPF metadata file."""

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    ui_data = {
        'css_choices': {
            'class': 'Use CSS classes',
            'inline': 'Use the style attribute',
            'tag': 'Use HTML tags wherever possible'},
        'sheet_choices': {
            'external': 'Use an external CSS file',
            'inline': 'Use a <style> tag in the HTML '
                      'file'},
    }
    options = {
        OptionRecommendation(
            name='htmlz_css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help='Specify the handling of CSS. Default is class.\n'
                 'class: {class}\n'
                 'inline: {inline}\n'
                 'tag: {tag}'.format(**ui_data['css_choices'])),
        OptionRecommendation(
            name='htmlz_class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help='How to handle the CSS when using css-type = \'class\'.\n'
                 'Default is external.\n'
                 'external: {external}\n'
                 'inline: {inline}'.format(**ui_data['sheet_choices'])),
        OptionRecommendation(
            name='htmlz_title_filename', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='If set this option causes the file name of the HTML file '
                 'inside the HTMLZ archive to be based on the book title.'),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Serialize *oeb_book* into an HTMLZ archive at *output_path*."""
        from lxml import etree
        from ebook_converter.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from ebook_converter.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.utils.filenames import ascii_filename

        # HTML: pick the htmlizer matching the requested CSS handling.
        if opts.htmlz_css_type == 'inline':
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from ebook_converter.utils.filenames import shorten_components_to
                fname = shorten_components_to(
                    100,
                    (ascii_filename(str(oeb_book.metadata.title[0])),))[0]
            with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
                if isinstance(html, str):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS (only when classes go into an external stylesheet).
            if opts.htmlz_css_type == 'class' and \
                    opts.htmlz_class_style == 'external':
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    css = htmlizer.get_css(oeb_book)
                    # The file is opened in binary mode; encode if the
                    # htmlizer handed back text.
                    if isinstance(css, str):
                        css = css.encode('utf-8')
                    tf.write(css)

            # Images referenced from the generated HTML.
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, u'images')):
                    os.makedirs(os.path.join(tdir, u'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and \
                            item.href in images:
                        if item.media_type == SVG_MIME:
                            # tostring(..., encoding='unicode') yields str;
                            # encode before the binary write (previously
                            # this raised TypeError for SVG images).
                            data = etree.tostring(
                                item.data,
                                encoding='unicode').encode('utf-8')
                        else:
                            data = item.data
                        img_path = os.path.join(tdir, u'images',
                                                images[item.href])
                        with open(img_path, 'wb') as img:
                            img.write(data)

            # Cover: best-effort, failures are reported but non-fatal.
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from ebook_converter.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    # Touch the file first so save_cover_data_to has a
                    # target path to overwrite.
                    with open(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # Narrowed from a bare except; keep the best-effort
                # behavior of reporting and continuing.
                import traceback
                traceback.print_exc()

            # Metadata: round-trip through OPF so the cover reference is
            # included when one was written.
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(
                    oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            htmlz.add_dir(tdir)
            # Previously the archive was never closed.
            htmlz.close()
class AZW3Output(OutputFormatPlugin):
    """Output plugin that writes a pure-KF8 (AZW3) Amazon e-book."""

    name = 'AZW3 Output'
    author = 'Kovid Goyal'
    file_type = 'azw3'
    commit_name = 'azw3_output'

    options = {
        OptionRecommendation(
            name='prefer_author_sort', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='When present, use author sort field as author.'),
        OptionRecommendation(
            name='no_inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Don\'t add Table of Contents to the book. Useful if '
                 'the book has its own table of contents.'),
        OptionRecommendation(
            name='toc_title', recommended_value=None,
            help='Title for any generated in-line table of contents.'),
        OptionRecommendation(
            name='dont_compress', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Disable compression of the file contents.'),
        OptionRecommendation(
            name='mobi_toc_at_start', recommended_value=False,
            help='When adding the Table of Contents to the book, add it at the start of the '
                 'book instead of the end. Not recommended.'),
        OptionRecommendation(
            name='extract_to',
            help='Extract the contents of the generated %s file to the '
                 'specified directory. The contents of the directory are first '
                 'deleted, so be careful.' % 'AZW3'),
        OptionRecommendation(
            name='share_not_sync', recommended_value=False,
            help='Enable sharing of book content via Facebook etc. '
                 ' on the Kindle. WARNING: Using this feature means that '
                 ' the book will not auto sync its last read position '
                 ' on multiple devices. Complain to Amazon.'),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write *oeb* out as a KF8 book at *output_path*."""
        from ebook_converter.ebooks.mobi.writer2.resources import Resources
        from ebook_converter.ebooks.mobi.writer8.main import create_kf8_book
        from ebook_converter.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
        from ebook_converter.ebooks.oeb.transforms.split import Split

        self.oeb, self.opts, self.log = oeb, opts, log
        opts.mobi_periodical = self.is_periodical
        is_passthrough = getattr(opts, 'mobi_passthrough', False)

        remove_duplicate_anchors(oeb)
        res = Resources(self.oeb, self.opts, self.is_periodical,
                        add_fonts=True, process_images=False)
        if not is_passthrough:
            remove_html_cover(self.oeb, self.log)

        # Splitting on page breaks makes the resulting KF8 faster to load.
        Split()(self.oeb, self.opts)

        book = create_kf8_book(self.oeb, self.opts, res, for_joint=False)
        book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Apply MOBI/KF8-specific CSS cleanups to a single spine item."""
        from ebook_converter.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)
class LRFOutput(OutputFormatPlugin):
    """Output plugin producing Sony LRF files.

    Image collections (comics) are rendered page-by-page via pylrs;
    everything else goes through the OEB output plugin and then the
    HTML-to-LRF converter.
    """

    name = 'LRF Output'
    author = 'Kovid Goyal'
    file_type = 'lrf'
    commit_name = 'lrf_output'

    options = {
        OptionRecommendation(
            name='enable_autorotation', recommended_value=False,
            help='Enable auto-rotation of images that are wider than the '
                 'screen width.'),
        OptionRecommendation(
            name='wordspace', recommended_value=2.5,
            level=OptionRecommendation.LOW,
            help='Set the space between words in pts. Default is %default'),
        OptionRecommendation(
            name='header', recommended_value=False,
            help='Add a header to all the pages with title and author.'),
        OptionRecommendation(
            name='header_format', recommended_value="%t by %a",
            help='Set the format of the header. %a is replaced by the author '
                 'and %t by the title. Default is %default'),
        OptionRecommendation(
            name='header_separation', recommended_value=0,
            help='Add extra spacing below the header. Default is %default pt.'),
        OptionRecommendation(
            name='minimum_indent', recommended_value=0,
            help='Minimum paragraph indent (the indent of the first line '
                 'of a paragraph) in pts. Default: %default'),
        OptionRecommendation(
            name='render_tables_as_images', recommended_value=False,
            help='This option has no effect'),
        OptionRecommendation(
            name='text_size_multiplier_for_rendered_tables',
            recommended_value=1.0,
            help='Multiply the size of text in rendered tables by this '
                 'factor. Default is %default'),
        OptionRecommendation(
            name='serif_family', recommended_value=None,
            help='The serif family of fonts to embed'),
        OptionRecommendation(
            name='sans_family', recommended_value=None,
            help='The sans-serif family of fonts to embed'),
        OptionRecommendation(
            name='mono_family', recommended_value=None,
            help='The monospace family of fonts to embed'),
    }

    recommendations = {
        ('change_justification', 'original', OptionRecommendation.HIGH)}

    def convert_images(self, pages, opts, wide):
        """Render a list of image files (*pages*) as one LRF page each.

        :param wide: when true, use landscape page dimensions
        """
        from ebook_converter.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
        from uuid import uuid4
        from ebook_converter.constants_old import __appname__, __version__

        # Page dimensions in LRF units for landscape vs portrait.
        width, height = (784, 1012) if wide else (584, 754)

        # Zero out all margins so each image fills the page.
        ps = {}
        ps['topmargin'] = 0
        ps['evensidemargin'] = 0
        ps['oddsidemargin'] = 0
        ps['textwidth'] = width
        ps['textheight'] = height

        book = Book(title=opts.title, author=opts.author,
                    bookid=uuid4().hex,
                    publisher='%s %s' % (__appname__, __version__),
                    category='Comic', pagestyledefault=ps,
                    booksetting=BookSetting(screenwidth=width,
                                            screenheight=height))
        # One full-page ImageBlock per source image; the stream/page/block
        # creation order matters to pylrs.
        for page in pages:
            imageStream = ImageStream(page)
            _page = book.create_page()
            _page.append(ImageBlock(refstream=imageStream,
                                    blockwidth=width, blockheight=height,
                                    xsize=width, ysize=height,
                                    x1=width, y1=height))
            book.append(_page)

        # NOTE(review): the file handle passed to renderLrf is never
        # explicitly closed here — confirm pylrs closes it.
        book.renderLrf(open(opts.output, 'wb'))

    def flatten_toc(self):
        """Replace the TOC with a flat version (LRF has no nested TOC)."""
        from ebook_converter.ebooks.oeb.base import TOC
        nroot = TOC()
        for x in self.oeb.toc.iterdescendants():
            nroot.add(x.title, x.href)
        self.oeb.toc = nroot

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Convert *oeb* to LRF at *output_path*.

        Image collections bypass the OEB pipeline entirely.
        """
        self.log, self.opts, self.oeb = log, opts, oeb

        lrf_opts = LRFOptions(output_path, opts, oeb)

        if input_plugin.is_image_collection:
            self.convert_images(input_plugin.get_images(), lrf_opts,
                                getattr(opts, 'wide', False))
            return

        self.flatten_toc()

        from ebook_converter.ptempfile import TemporaryDirectory
        with TemporaryDirectory('_lrf_output') as tdir:
            # Serialize to OEB on disk, then feed the OPF to the legacy
            # HTML-to-LRF converter.
            from ebook_converter.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            from ebook_converter.ebooks.lrf.html.convert_from import process_file
            process_file(os.path.join(tdir, opf), lrf_opts, self.log)
class MOBIOutput(OutputFormatPlugin):
    """Output plugin producing MOBI files (MOBI 6, KF8, or joint files).

    Periodicals get special TOC restructuring for Kindle periodical
    indexing and are always written as MOBI 6.
    """

    name = 'MOBI Output'
    author = 'Kovid Goyal'
    file_type = 'mobi'
    commit_name = 'mobi_output'
    ui_data = {'file_types': ['old', 'both', 'new']}

    options = {
        OptionRecommendation(
            name='prefer_author_sort', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='When present, use author sort field as author.'),
        OptionRecommendation(
            name='no_inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Don\'t add Table of Contents to the book. Useful if '
                 'the book has its own table of contents.'),
        OptionRecommendation(
            name='toc_title', recommended_value=None,
            help='Title for any generated in-line table of contents.'),
        OptionRecommendation(
            name='dont_compress', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Disable compression of the file contents.'),
        OptionRecommendation(
            name='personal_doc', recommended_value='[PDOC]',
            help='Tag for MOBI files to be marked as personal documents.'
                 ' This option has no effect on the conversion. It is used'
                 ' only when sending MOBI files to a device. If the file'
                 ' being sent has the specified tag, it will be marked as'
                 ' a personal document when sent to the Kindle.'),
        OptionRecommendation(
            name='mobi_ignore_margins', recommended_value=False,
            help='Ignore margins in the input document. If False, then '
                 'the MOBI output plugin will try to convert margins specified'
                 ' in the input document, otherwise it will ignore them.'),
        OptionRecommendation(
            name='mobi_toc_at_start', recommended_value=False,
            help='When adding the Table of Contents to the book, add it at the start of the '
                 'book instead of the end. Not recommended.'),
        OptionRecommendation(
            name='extract_to',
            help='Extract the contents of the generated %s file to the '
                 'specified directory. The contents of the directory are first '
                 'deleted, so be careful.' % 'MOBI'),
        OptionRecommendation(
            name='share_not_sync', recommended_value=False,
            help='Enable sharing of book content via Facebook etc. '
                 ' on the Kindle. WARNING: Using this feature means that '
                 ' the book will not auto sync its last read position '
                 ' on multiple devices. Complain to Amazon.'),
        OptionRecommendation(
            name='mobi_keep_original_images', recommended_value=False,
            help='By default calibre converts all images to JPEG format '
                 'in the output MOBI file. This is for maximum compatibility '
                 'as some older MOBI viewers have problems with other image '
                 'formats. This option tells calibre not to do this. '
                 'Useful if your document contains lots of GIF/PNG images that '
                 'become very large when converted to JPEG.'),
        OptionRecommendation(
            name='mobi_file_type', choices=ui_data['file_types'],
            recommended_value='old',
            help='By default calibre generates MOBI files that contain the '
                 'old MOBI 6 format. This format is compatible with all '
                 'devices. However, by changing this setting, you can tell '
                 'calibre to generate MOBI files that contain both MOBI 6 and '
                 'the new KF8 format, or only the new KF8 format. KF8 has '
                 'more features than MOBI 6, but only works with newer Kindles. '
                 'Allowed values: {}'.format('old, both, new')),
    }

    def check_for_periodical(self):
        """Set periodical mode, restructuring the TOC when applicable."""
        if self.is_periodical:
            self.periodicalize_toc()
            self.check_for_masthead()
            self.opts.mobi_periodical = True
        else:
            self.opts.mobi_periodical = False

    def check_for_masthead(self):
        """Ensure a masthead image exists, generating a default if needed."""
        found = 'masthead' in self.oeb.guide
        if not found:
            from ebook_converter.ebooks import generate_masthead
            self.oeb.log.debug('No masthead found in manifest, generating '
                               'default mastheadImage...')
            raw = generate_masthead(str(self.oeb.metadata['title'][0]))
            id, href = self.oeb.manifest.generate('masthead', 'masthead')
            self.oeb.manifest.add(id, href, 'image/gif', data=raw)
            self.oeb.guide.add('masthead', 'Masthead Image', href)
        else:
            self.oeb.log.debug('Using mastheadImage supplied in manifest...')

    def periodicalize_toc(self):
        """Rewrite the TOC into the periodical/section/article hierarchy
        that Kindle periodical indexing requires.

        Mutates the manifest (removing feed-index pages) and rebuilds the
        TOC in place; order of operations is significant throughout.
        """
        from ebook_converter.ebooks.oeb.base import TOC
        toc = self.oeb.toc
        # Too small to be a real periodical; leave the TOC alone.
        if not toc or len(self.oeb.spine) < 3:
            return
        if toc and toc[0].klass != 'periodical':
            # First two spine items are the generated index pages.
            one, two = self.oeb.spine[0], self.oeb.spine[1]
            self.log.info('Converting TOC for MOBI periodical indexing...')

            articles = {}
            if toc.depth() < 3:
                # single section periodical
                self.oeb.manifest.remove(one)
                self.oeb.manifest.remove(two)
                sections = [
                    TOC(klass='section', title='All articles',
                        href=self.oeb.spine[0].href)
                ]
                for x in toc:
                    sections[0].nodes.append(x)
            else:
                # multi-section periodical
                self.oeb.manifest.remove(one)
                sections = list(toc)
                for i, x in enumerate(sections):
                    x.klass = 'section'
                    articles_ = list(x)
                    if articles_:
                        # Drop the per-section index page and point the
                        # section at its first article instead.
                        self.oeb.manifest.remove(
                            self.oeb.manifest.hrefs[x.href])
                        x.href = articles_[0].href

            # Detach articles from their sections, remembering them per
            # section so they can be re-attached below.
            for sec in sections:
                articles[id(sec)] = []
                for a in list(sec):
                    a.klass = 'article'
                    articles[id(sec)].append(a)
                    sec.nodes.remove(a)

            root = TOC(klass='periodical', href=self.oeb.spine[0].href,
                       title=str(self.oeb.metadata.title[0]))

            # Only sections that actually contain articles are kept.
            for s in sections:
                if articles[id(s)]:
                    for a in articles[id(s)]:
                        s.nodes.append(a)
                    root.nodes.append(s)

            for x in list(toc.nodes):
                toc.nodes.remove(x)

            toc.nodes.append(root)

            # Fix up the periodical href to point to first section href
            toc.nodes[0].href = toc.nodes[0].nodes[0].href

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write *oeb* as MOBI 6, KF8, or a joint file per mobi_file_type."""
        from ebook_converter.ebooks.mobi.writer2.resources import Resources
        self.log, self.opts, self.oeb = log, opts, oeb
        mobi_type = opts.mobi_file_type
        if self.is_periodical:
            mobi_type = 'old'  # Amazon does not support KF8 periodicals
        create_kf8 = mobi_type in ('new', 'both')

        remove_html_cover(self.oeb, self.log)
        resources = Resources(oeb, opts, self.is_periodical,
                              add_fonts=create_kf8)
        self.check_for_periodical()

        if create_kf8:
            from ebook_converter.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
            remove_duplicate_anchors(self.oeb)
            # Split on pagebreaks so that the resulting KF8 is faster to
            # load
            from ebook_converter.ebooks.oeb.transforms.split import Split
            Split()(self.oeb, self.opts)

        kf8 = self.create_kf8(resources, for_joint=mobi_type == 'both') \
            if create_kf8 else None

        if mobi_type == 'new':
            # Pure KF8: no MOBI 6 record needed.
            kf8.write(output_path)
            extract_mobi(output_path, opts)
            return

        self.log.info('Creating MOBI 6 output')
        self.write_mobi(input_plugin, output_path, kf8, resources)

    def create_kf8(self, resources, for_joint=False):
        """Build the KF8 representation of the current book."""
        from ebook_converter.ebooks.mobi.writer8.main import create_kf8_book
        return create_kf8_book(self.oeb, self.opts, resources,
                               for_joint=for_joint)

    def write_mobi(self, input_plugin, output_path, kf8, resources):
        """Run the MOBI 6 pipeline (TOC, case mangling, SVG rasterization,
        MobiML) and write the final file, embedding *kf8* when given."""
        from ebook_converter.ebooks.mobi.mobiml import MobiMLizer
        from ebook_converter.ebooks.oeb.transforms.manglecase import CaseMangler
        from ebook_converter.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from ebook_converter.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from ebook_converter.customize.ui import plugin_for_input_format

        opts, oeb = self.opts, self.oeb
        if not opts.no_inline_toc:
            tocadder = HTMLTOCAdder(
                title=opts.toc_title,
                position='start' if opts.mobi_toc_at_start else 'end')
            tocadder(oeb, opts)
        mangler = CaseMangler()
        mangler(oeb, opts)
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb, opts)
        except Unavailable:
            self.log.warning('SVG rasterizer unavailable, SVG will not be '
                             'converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()
        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
        mobimlizer(oeb, opts)
        # CBZ input inserts its own page breaks, so skip the automatic ones.
        write_page_breaks_after_item = \
            input_plugin is not plugin_for_input_format('cbz')
        from ebook_converter.ebooks.mobi.writer2.main import MobiWriter
        writer = MobiWriter(
            opts, resources, kf8,
            write_page_breaks_after_item=write_page_breaks_after_item)
        writer(oeb, output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Apply MOBI-specific CSS cleanups to a single spine item."""
        from ebook_converter.ebooks.mobi.writer8.cleanup import CSSCleanup
        CSSCleanup(log, opts)(item, stylizer)

    def workaround_fire_bugs(self, jacket):
        # The idiotic Fire crashes when trying to render the table used to
        # layout the jacket
        from ebook_converter.ebooks.oeb.base import XHTML
        for table in jacket.data.xpath('//*[local-name()="table"]'):
            table.tag = XHTML('div')
            for tr in table.xpath('descendant::*[local-name()="tr"]'):
                cols = tr.xpath('descendant::*[local-name()="td"]')
                tr.tag = XHTML('div')
                for td in cols:
                    td.tag = XHTML('span' if cols else 'div')
class TXTOutput(OutputFormatPlugin):
    """Serialize an OEB book to plain text (plain, Markdown or Textile)."""

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    # Exposed to the GUI so option widgets can present readable choices
    ui_data = {
        'newline_types': NEWLINE_TYPES,
        'formatting_types': {
            'plain': 'Plain text',
            'markdown': 'Markdown formatted text',
            'textile': 'TexTile formatted text'
        },
    }
    options = {
        OptionRecommendation(
            name='newline', recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n', choices=NEWLINE_TYPES,
            help='Type of newline to use. Options are %s. Default is '
                 '\'system\'. '
                 'Use \'old_mac\' for compatibility with Mac OS 9 and '
                 'earlier. '
                 'For macOS use \'unix\'. \'system\' will default to the '
                 'newline '
                 'type used by this OS.' % sorted(NEWLINE_TYPES)),
        OptionRecommendation(
            name='txt_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
                 'The default is utf-8.'),
        OptionRecommendation(
            name='inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.'),
        OptionRecommendation(
            name='max_line_length', recommended_value=0,
            level=OptionRecommendation.LOW,
            help='The maximum number of characters per line. This splits on '
                 'the first space before the specified value. If no space is '
                 'found the line will be broken at the space after and will '
                 'exceed the specified value. Also, there is a minimum of 25 '
                 'characters. Use 0 to disable line splitting.'),
        OptionRecommendation(
            name='force_max_line_length', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Force splitting on the max-line-length value when no space '
                 'is present. Also allows max-line-length to be below the '
                 'minimum'),
        OptionRecommendation(
            name='txt_output_formatting', recommended_value='plain',
            choices=list(ui_data['formatting_types']),
            help='Formatting used within the document.\n'
                 '* plain: {plain}\n'
                 '* markdown: {markdown}\n'
                 '* textile: {textile}'
                 ''.format(**ui_data['formatting_types'])),
        OptionRecommendation(
            name='keep_links', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not remove links within the document. This is only '
                 'useful when paired with a txt-output-formatting option '
                 'that is not none because links are always removed with '
                 'plain text output.'),
        OptionRecommendation(
            name='keep_image_references', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not remove image references within the document. This '
                 'is only useful when paired with a txt-output-formatting '
                 'option that is not none because links are always removed '
                 'with plain text output.'),
        OptionRecommendation(
            name='keep_color', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not remove font color from output. This is only useful '
                 'when txt-output-formatting is set to textile. Textile is '
                 'the only formatting that supports setting font color. If '
                 'this option is not specified font color will not be set '
                 'and default to the color displayed by the reader '
                 '(generally this is black).')
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Extract text content, normalize it, and write it out.

        Picks the MLizer matching ``opts.txt_output_formatting``
        (markdown / textile / plain), strips non-ASCII control chars,
        converts newlines to the selected convention and encodes with
        ``opts.txt_output_encoding``. ``output_path`` may be either a
        filesystem path or an already-open writable stream.
        """
        from ebook_converter.ebooks.txt.txtml import TXTMLizer
        from ebook_converter.utils.cleantext import clean_ascii_chars
        from ebook_converter.ebooks.txt.newlines import specified_newlines, \
            TxtNewlines

        if opts.txt_output_formatting.lower() == 'markdown':
            from ebook_converter.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif opts.txt_output_formatting.lower() == 'textile':
            from ebook_converter.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # Only close streams we opened ourselves; never close a caller's
        # stream passed in as output_path.
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(
                    output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # Caller-supplied streams may already contain data: rewind and drop it
        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))

        if close:
            out_stream.close()
class HTMLInput(InputFormatPlugin):
    """Convert a tree of linked HTML files (or an OPF) into an OEB book."""

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Traverse links in HTML files breadth first. Normally, '
                 'they are traversed depth first.'
        ),
        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help='Maximum levels of recursion when following links in '
                 'HTML files. Must be non-negative. 0 implies that no '
                 'links in the root HTML file are followed. Default is '
                 '%default.'
        ),
        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Normally this input plugin re-arranges all the input '
                 'files into a standard folder hierarchy. Only use this '
                 'option '
                 'if you know what you are doing as it can result in various '
                 'nasty side effects in the rest of the conversion pipeline.'
        ),
    }

    def convert(self, stream, opts, file_ext, log, accelerators):
        """Entry point: dispatch to OPF- or HTML-based book creation.

        For plain HTML input, metadata is sniffed from the document and
        (when available) merged with metadata parsed from the filename.
        """
        basedir = os.getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)
        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError('The --dont-package option is not '
                                 'supported for an HTML input file')
            from ebook_converter.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                from ebook_converter.ebooks.metadata.meta import \
                    metadata_from_filename
                # Filename-derived metadata wins where present
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from ebook_converter.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                              encoding=opts.input_encoding)

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Create an OEBBook by spidering ``htmlpath`` for linked files.

        Builds the manifest/spine from the discovered HTML files, fills in
        missing metadata (language, creator, title, identifier), rewrites
        intra-document links through :meth:`resource_adder`, and generates
        a table of contents from titles (falling back to headers when
        titles are not unique).
        """
        import uuid
        from ebook_converter.ebooks.conversion.plumber import create_oebbook
        from ebook_converter.ebooks.oeb.base import (DirContainer,
                rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES, xpath,
                urlquote)
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from ebook_converter.ebooks.html.input import get_filelist
        from ebook_converter.ebooks.metadata import string_to_authors
        from ebook_converter.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                             encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn('Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate('Unknown')]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate('Unknown'))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        # Binary files are skipped; they get added lazily by resource_adder
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                                         ignore_opf=True)
            bname = os.path.basename(path)
            # NOTE: ``id`` shadows the builtin here (kept as-is)
            id, href = oeb.manifest.generate(id='html',
                                             href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            self.added_resources[path] = href
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urllib.parse.urldefrag
        self.BINARY_MIME = BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data,
                          functools.partial(self.resource_adder, base=dpath))

        # Rewrite url(...) references inside stylesheets as well
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(
                    item.data,
                    functools.partial(self.resource_adder, base=dpath))

        # Auto-generate a TOC: prefer per-file <title>s, but fall back to
        # first headers when titles are duplicated across files
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        """Resolve ``link_`` to a (local filesystem path, fragment) pair.

        Returns ``(None, None)`` for non-local or undecodable links.
        """
        from ebook_converter.ebooks.html.input import Link
        if not isinstance(link_, str):
            try:
                # NOTE(review): 'error' is not a registered codecs error
                # handler ('strict'/'ignore'/'replace' are); on invalid
                # UTF-8 this raises LookupError, which the bare except
                # below happens to swallow — confirm intended handler.
                link_ = link_.decode('utf-8', 'error')
            except:
                self.log.warn('Failed to decode link %r. Ignoring' % link_)
                return None, None
        try:
            l = Link(link_, base if base else os.getcwd())
        except:
            self.log.exception('Failed to process link: %r' % link_)
            return None, None
        if l.path is None:
            # Not a local resource
            return None, None
        link = l.path.replace('/', os.sep).strip()
        frag = l.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        """Link-rewrite callback: register a local resource and return its
        manifest href (with fragment preserved).

        Unreadable, non-local or directory links are returned unchanged;
        text-file links are dropped (returns None).
        """
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            id, href = self.oeb.manifest.generate(
                id='added', href=sanitize_file_name(bhref))
            guessed = mimetypes.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r' % link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = mimetypes.guess_type(
                            'dummy.' + img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            self.oeb.container = self.DirContainer(
                os.path.dirname(link), self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            # XXX(gryf): why the heck it was changed to bytes?
            item.html_input_href = urllib.parse.quote(bhref)
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = functools.partial(
                    self.css_import_handler, os.path.dirname(link))
            # Accessing .data forces the resource to be loaded now
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        """Fetch an @import-ed stylesheet from disk; returns (None, css_text)
        or (None, None) when the file cannot be read."""
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or \
                os.path.isdir(link):
            return (None, None)
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except:
            self.log.exception('Failed to read CSS file: %r' % link)
            return (None, None)
        return (None, raw)
class FB2Input(InputFormatPlugin):
    """Convert FictionBook2 (.fb2/.fbz) files to HTML via XSLT."""

    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 and FBZ files to HTML'
    file_types = {'fb2', 'fbz'}
    commit_name = 'fb2_input'

    recommendations = {('level1_toc', '//h:h1', OptionRecommendation.MED),
                       ('level2_toc', '//h:h2', OptionRecommendation.MED),
                       ('level3_toc', '//h:h3', OptionRecommendation.MED)}

    options = {
        OptionRecommendation(name='no_inline_fb2_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Do not insert a Table of Contents '
                 'at the beginning of the book.')
    }

    def convert(self, stream, options, file_ext, log, accelerators):
        """Transform the FB2 XML into index.xhtml + OPF in the current dir.

        Returns the absolute path of the generated metadata.opf. Embedded
        binaries (images) are extracted to files, stylesheets are converted
        to inline-styles.css, and note/cite links are cross-wired so cites
        point back at their notes.
        """
        from ebook_converter.ebooks.metadata.fb2 import ensure_namespace
        from ebook_converter.ebooks.metadata.fb2 import get_fb2_data
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        # NUL bytes are illegal in XML and break lxml
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            # NOTE(review): this retry looks like it should escape bare
            # ampersands (i.e. replace '& ' with an XML entity); as written
            # it only removes the space — confirm against upstream.
            doc = etree.fromstring(raw.replace('& ', '&'))
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS
        NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS}

        # Collect all embedded CSS stylesheets into a single string
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and '
                                '@type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(
                s, encoding='unicode', method='text', with_tail=False) + \
                '\n\n'
        if css:
            import css_parser
            import logging
            parser = css_parser.CSSParser(
                fetcher=None, log=logging.getLogger('calibre.css'))
            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = const.XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            # FB2 style/name selectors -> the span/class markup the XSL emits
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)

        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        with open(
                pkg_resources.resource_filename('ebook_converter',
                                                'data/fb2.xsl')) as f:
            ss = f.read()
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log.info('Disabling generation of inline FB2 TOC')
            # The XSL carries its TOC between these marker comments
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub('', ss)

        styledoc = etree.fromstring(ss)
        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
        }
        all_ids = {x for x in result.xpath('//*/@id')}
        for cite, a in cites.items():
            note = notes.get(cite, None)
            if note:
                # Give the note a unique synthetic id when it lacks one
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        # Remap image srcs to the filenames extracted from binaries
        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        with open('index.xhtml', 'wb') as f:
            f.write(index.encode('utf-8'))
        with open('inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image',
                                 namespaces=NAMESPACES):
                href = img.get('{%s}href' % const.XLINK_NS,
                               img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(os.getcwd(), mi)
        entries = [(f2, mimetypes.guess_type(f2)[0])
                   for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        """Decode top-level <binary> elements (images) to files on disk.

        Populates ``self.binary_map`` mapping binary id -> written filename;
        corrupt base64 payloads are logged and skipped.
        """
        from ebook_converter.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
                ct = elem.get('content-type', '')
                fname = elem.attrib['id']
                ext = ct.rpartition('/')[-1].lower()
                if ext in ('png', 'jpeg', 'jpg'):
                    # Ensure the written file carries an image extension
                    if fname.lower().rpartition('.')[-1] not in {
                            'jpg', 'jpeg', 'png'}:
                        fname += '.' + ext
                self.binary_map[elem.get('id')] = fname
                raw = elem.text.strip()
                try:
                    data = base64_decode(raw)
                except TypeError:
                    self.log.exception(
                        'Binary data with id=%s is corrupted, '
                        'ignoring', elem.get('id'))
                else:
                    with open(fname, 'wb') as f:
                        f.write(data)
class FB2Output(OutputFormatPlugin):
    """Serialize an OEB book as a FictionBook2 (.fb2) document."""

    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'
    commit_name = 'fb2_output'

    # The closed set of genre identifiers defined by the FB2 2.1 spec.
    FB2_GENRES = [
        # Science Fiction & Fantasy
        'sf_history',    # Alternative history
        'sf_action',     # Action
        'sf_epic',       # Epic
        'sf_heroic',     # Heroic
        'sf_detective',  # Detective
        'sf_cyberpunk',  # Cyberpunk
        'sf_space',      # Space
        'sf_social',     # Social#philosophical
        'sf_horror',     # Horror & mystic
        'sf_humor',      # Humor
        'sf_fantasy',    # Fantasy
        'sf',            # Science Fiction
        # Detectives & Thrillers
        'det_classic',    # Classical detectives
        'det_police',     # Police Stories
        'det_action',     # Action
        'det_irony',      # Ironical detectives
        'det_history',    # Historical detectives
        'det_espionage',  # Espionage detectives
        'det_crime',      # Crime detectives
        'det_political',  # Political detectives
        'det_maniac',     # Maniacs
        'det_hard',       # Hard#boiled
        'thriller',       # Thrillers
        'detective',      # Detectives
        # Prose
        'prose_classic',       # Classics prose
        'prose_history',       # Historical prose
        'prose_contemporary',  # Contemporary prose
        'prose_counter',       # Counterculture
        'prose_rus_classic',   # Russial classics prose
        'prose_su_classics',   # Soviet classics prose
        # Romance
        'love_contemporary',  # Contemporary Romance
        'love_history',       # Historical Romance
        'love_detective',     # Detective Romance
        'love_short',         # Short Romance
        'love_erotica',       # Erotica
        # Adventure
        'adv_western',   # Western
        'adv_history',   # History
        'adv_indian',    # Indians
        'adv_maritime',  # Maritime Fiction
        'adv_geo',       # Travel & geography
        'adv_animal',    # Nature & animals
        'adventure',     # Other
        # Children's
        'child_tale',       # Fairy Tales
        'child_verse',      # Verses
        'child_prose',      # Prose
        'child_sf',         # Science Fiction
        'child_det',        # Detectives & Thrillers
        'child_adv',        # Adventures
        'child_education',  # Educational
        'children',         # Other
        # Poetry & Dramaturgy
        'poetry',      # Poetry
        'dramaturgy',  # Dramaturgy
        # Antique literature
        'antique_ant',       # Antique
        'antique_european',  # European
        'antique_russian',   # Old russian
        'antique_east',      # Old east
        'antique_myths',     # Myths. Legends. Epos
        'antique',           # Other
        # Scientific#educational
        'sci_history',     # History
        'sci_psychology',  # Psychology
        'sci_culture',     # Cultural science
        'sci_religion',    # Religious studies
        'sci_philosophy',  # Philosophy
        'sci_politics',    # Politics
        'sci_business',    # Business literature
        'sci_juris',       # Jurisprudence
        'sci_linguistic',  # Linguistics
        'sci_medicine',    # Medicine
        'sci_phys',        # Physics
        'sci_math',        # Mathematics
        'sci_chem',        # Chemistry
        'sci_biology',     # Biology
        'sci_tech',        # Technical
        'science',         # Other
        # Computers & Internet
        'comp_www',          # Internet
        'comp_programming',  # Programming
        'comp_hard',         # Hardware
        'comp_soft',         # Software
        'comp_db',           # Databases
        'comp_osnet',        # OS & Networking
        'computers',         # Other
        # Reference
        'ref_encyc',  # Encyclopedias
        'ref_dict',   # Dictionaries
        'ref_ref',    # Reference
        'ref_guide',  # Guidebooks
        'reference',  # Other
        # Nonfiction
        'nonf_biography',  # Biography & Memoirs
        'nonf_publicism',  # Publicism
        'nonf_criticism',  # Criticism
        'design',          # Art & design
        'nonfiction',      # Other
        # Religion & Inspiration
        'religion_rel',        # Religion
        'religion_esoterics',  # Esoterics
        'religion_self',       # Self#improvement
        'religion',            # Other
        # Humor
        'humor_anecdote',  # Anecdote (funny stories)
        'humor_prose',     # Prose
        'humor_verse',     # Verses
        'humor',           # Other
        # Home & Family
        'home_cooking',    # Cooking
        'home_pets',       # Pets
        'home_crafts',     # Hobbies & Crafts
        'home_entertain',  # Entertaining
        'home_health',     # Health
        'home_garden',     # Garden
        'home_diy',        # Do it yourself
        'home_sport',      # Sports
        'home_sex',        # Erotica & sex
        'home',            # Other
    ]

    # Exposed to the GUI so option widgets can present readable choices.
    ui_data = {
        'sectionize': {
            'toc': 'Section per entry in the ToC',
            'files': 'Section per file',
            'nothing': 'A single section',
        },
        'genres': FB2_GENRES,
    }

    options = {
        OptionRecommendation(
            name='sectionize', recommended_value='files',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sectionize']),
            help='Specify how sections are created:\n'
                 ' * nothing: {nothing}\n'
                 ' * files: {files}\n'
                 ' * toc: {toc}\n'
                 'If ToC based generation fails, adjust the "Structure '
                 'detection" and/or "Table of Contents" settings (turn on '
                 '"Force use of auto-generated Table of Contents")'
                 '.'.format(**ui_data['sectionize'])),
        OptionRecommendation(
            name='fb2_genre', recommended_value='antique',
            level=OptionRecommendation.LOW,
            choices=FB2_GENRES,
            help='Genre for the book. Choices: %s\n\n See: http://www.'
                 'fictionbook.org/index.php/Eng:FictionBook_2.1_genres for '
                 'a complete list with descriptions.' % ', '.join(
                     FB2_GENRES)),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Rasterize SVGs, linearize the jacket, and write the FB2 markup.

        ``output_path`` may be either a filesystem path (opened, and closed
        when done) or an already-open writable stream (left open).
        """
        from ebook_converter.ebooks.oeb.transforms.jacket import \
            linearize_jacket
        from ebook_converter.ebooks.oeb.transforms.rasterize import \
            SVGRasterizer, Unavailable
        from ebook_converter.ebooks.fb2.fb2ml import FB2MLizer

        # SVG rasterization is best-effort: a missing backend only warns.
        try:
            SVGRasterizer()(oeb_book, opts)
        except Unavailable:
            log.warning('SVG rasterizer unavailable, SVG will not be '
                        'converted')

        linearize_jacket(oeb_book)

        content = FB2MLizer(log).extract_content(oeb_book, opts)

        must_close = False
        if hasattr(output_path, 'write'):
            stream = output_path
        else:
            must_close = True
            parent = os.path.dirname(output_path)
            if parent != '' and not os.path.exists(parent):
                os.makedirs(parent)
            stream = open(output_path, 'wb')

        # A caller-supplied stream may hold stale data: rewind and drop it.
        stream.seek(0)
        stream.truncate()
        stream.write(content.encode('utf-8', 'replace'))

        if must_close:
            stream.close()
class PMLOutput(OutputFormatPlugin):
    """Serialize an OEB book as a PMLZ (zipped Palm Markup Language) file."""

    name = 'PML Output'
    author = 'John Schember'
    file_type = 'pmlz'
    commit_name = 'pml_output'

    options = {
        OptionRecommendation(
            name='pml_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help='Specify the character encoding of the output document. '
                 'The default is cp1252.'),
        OptionRecommendation(
            name='inline_toc', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Add Table of Contents to beginning of the book.'),
        OptionRecommendation(
            name='full_image_depth', recommended_value=False,
            level=OptionRecommendation.LOW,
            help='Do not reduce the size or bit depth of images. Images '
                 'have their size and depth reduced by default to accommodate '
                 'applications that can not convert images on their '
                 'own such as Dropbook.'),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write index.pml plus processed images to a temp dir and zip it.

        :param oeb_book: the OEB book to serialize
        :param output_path: path of the .pmlz archive to create
        :param input_plugin: unused here; part of the plugin interface
        :param opts: conversion options (uses pml_output_encoding,
            full_image_depth)
        :param log: logger
        """
        from ebook_converter.ebooks.pml.pmlml import PMLMLizer
        from ebook_converter.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = str(pmlmlizer.extract_content(oeb_book, opts))
            with open(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs,
                              img_path, opts)

            log.debug('Compressing output...')
            # Fix: the archive was previously never closed, so the zip's
            # central directory could stay unflushed and the file handle
            # leaked; always close, even if add_dir raises.
            pmlz = ZipFile(output_path, 'w')
            try:
                pmlz.add_dir(tdir)
            finally:
                pmlz.close()

    def write_images(self, manifest, image_hrefs, out_dir, opts):
        """Re-encode the referenced raster images as PNG files in *out_dir*.

        Unless ``opts.full_image_depth`` is set, images are palettized and
        thumbnailed to at most 300x300 for reader compatibility.
        """
        from PIL import Image
        from ebook_converter.ebooks.oeb.base import OEB_RASTER_IMAGES

        for item in manifest:
            # Only raster images actually referenced by the PML are written
            if item.media_type not in OEB_RASTER_IMAGES or \
                    item.href not in image_hrefs:
                continue
            if opts.full_image_depth:
                im = Image.open(io.BytesIO(item.data))
            else:
                im = Image.open(io.BytesIO(item.data)).convert('P')
                # Image.LANCZOS is the same filter as the old ANTIALIAS
                # alias, which was removed in Pillow 10.
                im.thumbnail((300, 300), Image.LANCZOS)

            data = io.BytesIO()
            im.save(data, 'PNG')
            data = data.getvalue()

            path = os.path.join(out_dir, image_hrefs[item.href])
            with open(path, 'wb') as out:
                out.write(data)