def convert(self, stream, opts, file_ext, log, accelerators):
    '''
    Convert an HTML (or OPF) input stream into an OEB book.

    :param stream: Input file object. When it has a ``name`` attribute, the
        directory and base name of that path are used as the base directory
        and as the source of filename-derived metadata.
    :param opts: Conversion options; ``dont_package`` and ``input_encoding``
        are read here.
    :param file_ext: Extension of the input; ``'opf'`` selects the OPF path.
    :param log: Logger handed on to the OEB book builders.
    :param accelerators: Unused by this method.
    :return: The object returned by ``self.create_oebbook`` (HTML input) or by
        the plumber's ``create_oebbook`` (OPF input).
    :raises ValueError: If ``opts.dont_package`` is set for a non-OPF input.
    '''
    self._is_case_sensitive = None
    # os.getcwdu() exists only on Python 2; fall back to os.getcwd() (which
    # already returns text on Python 3) so this does not raise AttributeError.
    basedir = getattr(os, 'getcwdu', os.getcwd)()
    self.opts = opts
    fname = None
    if hasattr(stream, 'name'):
        basedir = os.path.dirname(stream.name)
        fname = os.path.basename(stream.name)

    if file_ext != 'opf':
        if opts.dont_package:
            raise ValueError('The --dont-package option is not supported for an HTML input file')
        from calibre.ebooks.metadata.html import get_metadata
        mi = get_metadata(stream)
        if fname:
            # Metadata guessed from the file name is the base; metadata read
            # from the HTML itself is merged on top of it via smart_update().
            from calibre.ebooks.metadata.meta import metadata_from_filename
            fmi = metadata_from_filename(fname)
            fmi.smart_update(mi)
            mi = fmi
        oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
        return oeb

    from calibre.ebooks.conversion.plumber import create_oebbook
    return create_oebbook(log, stream.name, opts, encoding=opts.input_encoding)
def convert(self, stream, opts, file_ext, log, accelerators):
    '''
    Build an OEB book from an HTML or OPF input stream.

    OPF input is handed straight to the plumber's ``create_oebbook``. Any
    other input has its metadata read from the stream, merged over metadata
    derived from the file name (when the stream has a ``name``), and is then
    passed to ``self.create_oebbook``.

    Raises ``ValueError`` when ``opts.dont_package`` is set for non-OPF input.
    '''
    self._is_case_sensitive = None
    basedir = getcwd()
    self.opts = opts
    fname = None
    if hasattr(stream, 'name'):
        source_path = stream.name
        basedir = os.path.dirname(source_path)
        fname = os.path.basename(source_path)

    if file_ext == 'opf':
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts, encoding=opts.input_encoding)

    if opts.dont_package:
        raise ValueError(
            'The --dont-package option is not supported for an HTML input file'
        )

    from calibre.ebooks.metadata.html import get_metadata
    mi = get_metadata(stream)
    if fname:
        from calibre.ebooks.metadata.meta import metadata_from_filename
        # File-name metadata is the base; values found in the HTML win.
        merged = metadata_from_filename(fname)
        merged.smart_update(mi)
        mi = merged
    return self.create_oebbook(stream.name, basedir, opts, log, mi)
def extract_book(pathtoebook, tdir, log=None, view_kepub=False, processed=False, only_input_plugin=False):
    '''
    Run the input plugin for ``pathtoebook`` and explode the book into ``tdir``.

    :param pathtoebook: Path of the ebook to open.
    :param tdir: Directory handed to the plumber and input plugin as output.
    :param log: Logger; the calibre default log is used when falsy.
    :param view_kepub: Forwarded to ``Plumber``.
    :param processed: When true, always run ``create_oebbook`` on the result.
    :param only_input_plugin: When true, skip the ``create_oebbook`` step.
    :return: ``(book_format, pathtoopf, input_fmt)``.
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    from calibre.utils.logging import default_log

    log = log or default_log
    plumber = Plumber(pathtoebook, tdir, log, view_kepub=view_kepub)
    plumber.setup_options()
    if pathtoebook.lower().endswith('.opf'):
        plumber.opts.dont_package = True
    if hasattr(plumber.opts, 'no_process'):
        plumber.opts.no_process = True
    plumber.input_plugin.for_viewer = True

    with plumber.input_plugin, open(plumber.input, 'rb') as inf:
        pathtoopf = plumber.input_plugin(inf, plumber.opts, plumber.input_fmt, log, {}, tdir)

        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well: always when `processed`, otherwise only for pdb/pdf/rb
            # input that the plugin did not already return as an OEB book.
            run_pipeline = processed or (
                plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                and not hasattr(pathtoopf, 'manifest')
            )
            if run_pipeline:
                if hasattr(pathtoopf, 'manifest'):
                    pathtoopf = write_oebbook(pathtoopf, tdir)
                pathtoopf = create_oebbook(log, pathtoopf, plumber.opts)

        # An object with a manifest is an in-memory book: write it to disk.
        if hasattr(pathtoopf, 'manifest'):
            pathtoopf = write_oebbook(pathtoopf, tdir)

    extension = os.path.splitext(pathtoebook)[1]
    book_format = extension[1:].upper()
    if getattr(plumber.input_plugin, 'is_kf8', False):
        if getattr(plumber.input_plugin, 'mobi_is_joint', False):
            book_format = 'KF8' + ':joint'
        else:
            book_format = 'KF8' + ''
    return book_format, pathtoopf, plumber.input_fmt
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
    '''
    Create an OEB book from the table of contents file at ``hhcpath``.

    Metadata is copied from ``mi`` with fallbacks for language, creator and
    title. The file at ``hhcpath`` is parsed as HTML, and every chapter found
    by ``self._process_nodes`` is added through ``self._add_item``.

    :return: The populated OEB book (also stored on ``self.oeb``).
    '''
    import uuid
    from lxml import html
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import DirContainer

    oeb = create_oebbook(log, None, opts, encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    metadata = oeb.metadata
    if mi.title:
        metadata.add('title', mi.title)
    for author in (mi.authors or ()):
        metadata.add('creator', author, attrib={'role':'aut'})
    if mi.publisher:
        metadata.add('publisher', mi.publisher)
    if mi.isbn:
        metadata.add('identifier', mi.isbn, attrib={'scheme':'ISBN'})

    if not metadata.language:
        oeb.logger.warn(u'Language not specified')
        metadata.add('language', get_lang().replace('_', '-'))
    for field, warning in (('creator', 'Creator not specified'),
                           ('title', 'Title not specified')):
        if not getattr(metadata, field):
            oeb.logger.warn(warning)
            metadata.add(field, _('Unknown'))

    metadata.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
    # NOTE(review): the uid becomes the *first* identifier, not necessarily
    # the one carrying the 'id' attribute -- kept exactly as in the original.
    if any('id' in ident.attrib for ident in metadata.identifier):
        self.oeb.uid = metadata.identifier[0]

    hhcroot = html.fromstring(self._read_file(hhcpath))
    chapters = self._process_nodes(hhcroot)
    log.debug('Found %d section nodes' % len(chapters))
    if len(chapters) == 0:
        return oeb

    # Chapter files are resolved relative to the first chapter's directory.
    first_chapter_path = chapters[0][1]
    htmlpath = os.path.join(basedir, os.path.dirname(first_chapter_path))
    oeb.container = DirContainer(htmlpath, log)
    for chapter_title, chapter_path in ((c[0], c[1]) for c in chapters):
        self._add_item(oeb, chapter_title, os.path.basename(chapter_path))

    oeb.container = DirContainer(htmlpath, oeb.log)
    return oeb
def do_rebuild(opf, dest_path):
    '''
    Rebuild an AZW3 file at ``dest_path`` from the OPF at ``opf``.

    The book is parsed with ``create_oebbook``, given a cover by ``set_cover``
    and written by the AZW3 output plugin with ``mobi_passthrough`` enabled.
    All logging goes to ``default_log``.
    '''
    fmt = 'azw3'
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    reader_plugin = plugin_for_input_format(fmt)
    writer_plugin = plugin_for_output_format(fmt)
    plumber.opts.mobi_passthrough = True

    book = create_oebbook(default_log, opf, plumber.opts)
    set_cover(book)
    writer_plugin.convert(book, dest_path, reader_plugin, plumber.opts, default_log)
def opf_to_azw3(opf, outpath, log):
    '''
    Convert the book described by the OPF file ``opf`` to AZW3 at ``outpath``.

    :param opf: Path to the OPF file to read.
    :param outpath: Destination path for the AZW3 output.
    :param log: Logger used by the plumber, the OEB parser and the output plugin.
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

    plumber = Plumber(opf, outpath, log)
    plumber.setup_options()
    source_plugin = plugin_for_input_format('azw3')
    target_plugin = plugin_for_output_format('azw3')
    options = plumber.opts
    options.mobi_passthrough = True

    book = create_oebbook(log, opf, options)
    set_cover(book)
    target_plugin.convert(book, outpath, source_plugin, plumber.opts, log)
def commit(self, outpath=None, keep_parsed=False):
    '''
    Commit the container and rebuild the AZW3 file from its OPF.

    :param outpath: Destination path; defaults to ``self.pathtoazw3``.
    :param keep_parsed: Forwarded to the parent class's ``commit``.
    '''
    super(AZW3Container, self).commit(keep_parsed=keep_parsed)
    target = self.pathtoazw3 if outpath is None else outpath

    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    opf_path = self.name_path_map[self.opf_name]
    plumber = Plumber(opf_path, target, self.log)
    plumber.setup_options()
    reader_plugin = plugin_for_input_format('azw3')
    writer_plugin = plugin_for_output_format('azw3')
    plumber.opts.mobi_passthrough = True

    # NOTE(review): the plumber logs to self.log but parsing and writing use
    # default_log -- looks inconsistent; confirm whether self.log was intended.
    book = create_oebbook(default_log, opf_path, plumber.opts)
    set_cover(book)
    writer_plugin.convert(book, target, reader_plugin, plumber.opts, default_log)
def convert(self, stream: IO, options, file_ext, log, accelerators):
    """Convert a UMD file stream into an OEB book.

    The UMD file is parsed with ``UMDFile.from_stream``. Its title, author and
    publisher are copied into the book metadata, and each chapter that has
    both a title and content is written as an HTML file into a temporary
    directory that is kept after this method returns (``keep=True``).

    :param stream: Open UMD file object.
    :param options: Conversion options; ``input_encoding`` is read here.
    :param file_ext: Unused by this method.
    :param log: Logger for progress messages and the OEB container.
    :param accelerators: Unused by this method.
    :return: The populated OEB book.
    """
    from calibre.ebooks.oeb.base import DirContainer
    from calibre.ebooks.conversion.plumber import create_oebbook

    log.debug("Parsing UMD file...")
    book = UMDFile.from_stream(stream)

    log.debug("Handle meta data ...")
    oeb = create_oebbook(log, None, options, encoding=options.input_encoding, populate=False)
    oeb.metadata.add('title', book.title)
    oeb.metadata.add('creator', book.author, attrib={'role': 'aut'})
    oeb.metadata.add('publisher', book.publisher)
    oeb.metadata.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
    for id_ in oeb.metadata.identifier:
        if 'id' in id_.attrib:
            oeb.uid = oeb.metadata.identifier[0]
            break

    with TemporaryDirectory('_umd2oeb', keep=True) as tmp_dir:
        log.debug('Process TOC ...')
        oeb.container = DirContainer(tmp_dir, log)
        content, cover = book.chapters, book.cover
        if content:
            for i, ch in enumerate(content):
                ch_title, ch_content = ch.title, ch.content
                if ch_title is None or ch_content is None:
                    continue
                # Drop Unicode paragraph separators (U+2029) from the text.
                ch_content = ch_content.replace("\u2029", "")
                ch_fn = Path(tmp_dir) / f"ch_{i:04d}.html"
                # Always write UTF-8: without an explicit encoding write_text()
                # uses the locale encoding, which can fail or corrupt
                # non-ASCII chapter text on some platforms.
                ch_fn.write_text(convert_basic(ch_content, title=ch_title), encoding='utf-8')
                oeb.toc.add(ch_title, ch_fn.name)
                id_, href = oeb.manifest.generate(id='html', href=ch_fn.name)
                item = oeb.manifest.add(id_, href, 'text/html')
                item.html_input_href = ch_fn.name
                oeb.spine.add(item, True)
        if cover:
            cover_file = Path(tmp_dir) / "cover.jpeg"
            cover_file.write_bytes(cover)
            # NOTE(review): the cover href is generated and referenced from the
            # guide but never added to the manifest -- confirm this is intended.
            id_, href = oeb.manifest.generate(id='image', href=cover_file.name)
            oeb.guide.add('cover', 'Cover', href)
    return oeb
def opf_to_book(opf, outpath, container):
    '''
    Build a book at ``outpath`` from the OPF file ``opf``.

    When ``outpath`` ends in ``.azw3`` a KF8 book is created and written
    directly; for any other extension the plumber's normal ``run()`` is used.

    :param opf: Path to the OPF file.
    :param outpath: Destination path of the generated book.
    :param container: Provides ``log`` and ``parse_css``.
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

    class Item(Manifest.Item):

        def _parse_css(self, data):
            # The default CSS parser used by oeb.base inserts the h namespace
            # and resolves all @import rules. We dont want that.
            return container.parse_css(data)

    def specialize(oeb):
        oeb.manifest.Item = Item

    plumber = Plumber(opf, outpath, container.log)
    plumber.setup_options()

    class Reader(OEBReader):

        def _metadata_from_opf(self, opf):
            # Record the comic's original resolution, if the OPF declares one.
            for meta in xpath(opf, 'o2:metadata//o2:meta'):
                if meta.attrib.get('name') != 'original-resolution':
                    continue
                comic_book_exth_values['original-resolution'] = meta.attrib.get('content', '660x800')
            return OEBReader._metadata_from_opf(self, opf)

    oeb = create_oebbook(container.log, opf, plumber.opts, specialize=specialize, reader=Reader)
    fixup_metadata(oeb)
    set_cover_image(oeb)

    overrides = (
        ('dont_compress', True),
        ('toc_title', None),
        ('mobi_toc_at_start', False),
        ('no_inline_toc', True),
        ('mobi_periodical', False),
    )
    for option_name, option_value in overrides:
        setattr(plumber.opts, option_name, option_value)

    res = Resources(oeb, plumber.opts, False, process_images=False)
    extension = path.splitext(outpath)[1]
    if extension == '.azw3':
        book = create_kf8_book(oeb, plumber.opts, res)
        book.opts.prefer_author_sort = False
        book.opts.share_not_sync = False
        print ('\nWriting out: {}\n'.format(outpath))
        book.write(outpath)
    else:
        plumber.run()
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert a LIT stream to an OEB book by running ``create_oebbook`` with
    ``LitReader`` as the reader. ``log`` is also stored on ``self.log``.
    '''
    from calibre.ebooks.lit.reader import LitReader
    from calibre.ebooks.conversion.plumber import create_oebbook

    self.log = log
    book = create_oebbook(log, stream, options, reader=LitReader)
    return book
def __enter__(self, processed=False, only_input_plugin=False, run_char_count=True,
              read_anchor_map=True, extract_embedded_fonts_for_qt=False):
    '''
    Explode the ebook at ``self.pathtoebook`` into a temporary directory and
    prepare it for display in viewers/preprocessing etc.

    Sets ``self.base``, ``self.pathtoopf``, ``self.book_format``, ``self.opf``,
    ``self.language``, ``self.spine``, ``self.pages`` and ``self.toc``.

    :param processed: Always run the conversion pipeline's ``create_oebbook``.
    :param only_input_plugin: Skip ``create_oebbook`` entirely.
    :param run_char_count: Forwarded to every ``SpineItem``.
    :param read_anchor_map: Forwarded to every ``SpineItem``; also enables
        ``create_indexing_data``.
    :param extract_embedded_fonts_for_qt: Run ``extract_fonts`` on the OPF.
    :return: ``self``.
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

    self.delete_on_exit = []
    self._tdir = TemporaryDirectory('_ebook_iter')
    self.base = self._tdir.__enter__()

    plumber = Plumber(self.pathtoebook, self.base, self.log)
    plumber.setup_options()
    if self.pathtoebook.lower().endswith('.opf'):
        plumber.opts.dont_package = True
    if hasattr(plumber.opts, 'no_process'):
        plumber.opts.no_process = True
    plumber.input_plugin.for_viewer = True

    with plumber.input_plugin, open(plumber.input, 'rb') as inf:
        self.pathtoopf = plumber.input_plugin(
            inf, plumber.opts, plumber.input_fmt, self.log, {}, self.base)

        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well
            run_pipeline = processed or (
                plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                and not hasattr(self.pathtoopf, 'manifest')
            )
            if run_pipeline:
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts)

        # An object with a manifest is an in-memory book: write it to disk.
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

    extension = os.path.splitext(self.pathtoebook)[1]
    self.book_format = extension[1:].upper()
    if getattr(plumber.input_plugin, 'is_kf8', False):
        self.book_format = 'KF8'

    # Reuse an OPF object already built by the input plugin, if it has one.
    self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
    if self.opf is None:
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
    self.language = self.opf.language
    if self.language:
        self.language = self.language.lower()

    # Linear items first, non-linear items after (stable sort keeps order).
    ordered = sorted(self.opf.spine, key=lambda entry: not entry.is_linear)

    self.spine = []
    make_item = partial(SpineItem, read_anchor_map=read_anchor_map,
                        run_char_count=run_char_count)
    is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
    for entry in ordered:
        spath = entry.path
        mime = None
        if entry.idref is not None:
            mime = self.opf.manifest.type_for_id(entry.idref)
        if mime is None:
            mime = guess_type(spath)[0]
        try:
            self.spine.append(make_item(spath, mime_type=mime))
            if is_comic:
                self.spine[-1].is_single_page = True
        except:
            self.log.warn('Missing spine item:', repr(spath))

    cover = self.opf.cover
    if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3'}:
        # Generate a title page wrapping the cover and put it first.
        cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
        rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
        chtml = (TITLEPAGE % prepare_string_for_xml(rcpath, True)).encode('utf-8')
        with open(cfile, 'wb') as out:
            out.write(chtml)
        self.spine.insert(0, make_item(cfile, mime_type='application/xhtml+xml'))
        self.delete_on_exit.append(cfile)

    html_toc = self.opf.path_to_html_toc
    if html_toc is not None and html_toc not in self.spine:
        try:
            self.spine.append(make_item(html_toc))
        except:
            import traceback
            traceback.print_exc()

    chars_per_page = float(self.CHARACTERS_PER_PAGE)
    self.pages = [math.ceil(item.character_count / chars_per_page) for item in self.spine]
    next_start = 1
    for item, page_count in zip(self.spine, self.pages):
        item.pages = page_count
        item.start_page = next_start
        next_start += item.pages
        item.max_page = item.start_page + item.pages - 1

    self.toc = self.opf.toc
    if read_anchor_map:
        create_indexing_data(self.spine, self.toc)

    self.read_bookmarks()

    if extract_embedded_fonts_for_qt:
        from calibre.ebooks.oeb.iterator.extract_fonts import extract_fonts
        try:
            extract_fonts(self.opf, self.log)
        except:
            # Temporarily lower the filter level so the traceback is logged.
            saved_level = self.log.filter_level
            self.log.filter_level = self.log.DEBUG
            self.log.exception('Failed to extract fonts')
            self.log.filter_level = saved_level

    return self
def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
    '''
    Build an OEB book whose spine comes from the contents file ``hhcpath``.

    :param hhcpath: Path read with ``self._read_file`` and parsed as HTML.
    :param basedir: Directory that chapter paths are joined onto.
    :param opts: Conversion options; ``input_encoding`` is read here.
    :param log: Logger for debug output and the first container.
    :param mi: Metadata object supplying title, authors, publisher and isbn.
    :return: The OEB book, which is also stored on ``self.oeb``.
    '''
    import uuid
    from lxml import html
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import DirContainer

    oeb = create_oebbook(log, None, opts, encoding=opts.input_encoding, populate=False)
    self.oeb = oeb
    metadata = oeb.metadata

    # Copy over whatever the supplied metadata actually contains.
    if mi.title:
        metadata.add('title', mi.title)
    if mi.authors:
        role = {'role': 'aut'}
        for name in mi.authors:
            metadata.add('creator', name, attrib=dict(role))
    if mi.publisher:
        metadata.add('publisher', mi.publisher)
    if mi.isbn:
        metadata.add('identifier', mi.isbn, attrib={'scheme': 'ISBN'})

    # Fill in the fields that are still missing.
    if not metadata.language:
        oeb.logger.warn(u'Language not specified')
        metadata.add('language', get_lang().replace('_', '-'))
    if not metadata.creator:
        oeb.logger.warn('Creator not specified')
        metadata.add('creator', _('Unknown'))
    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', _('Unknown'))

    bookid = str(uuid.uuid4())
    metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    has_id_attr = False
    for ident in metadata.identifier:
        if 'id' in ident.attrib:
            has_id_attr = True
            break
    if has_id_attr:
        self.oeb.uid = metadata.identifier[0]

    hhcdata = self._read_file(hhcpath)
    chapters = self._process_nodes(html.fromstring(hhcdata))
    log.debug('Found %d section nodes' % len(chapters))

    if len(chapters) > 0:
        # All chapters are looked up in the directory of the first chapter.
        htmlpath = os.path.join(basedir, os.path.dirname(chapters[0][1]))
        oeb.container = DirContainer(htmlpath, log)
        for entry in chapters:
            self._add_item(oeb, entry[0], os.path.basename(entry[1]))
        oeb.container = DirContainer(htmlpath, oeb.log)
    return oeb
def __enter__(self, processed=False, only_input_plugin=False, run_char_count=True,
              read_anchor_map=True, view_kepub=False, read_links=True):
    '''
    Explode the ebook at ``self.pathtoebook`` into a temporary directory and
    prepare it for display in viewers/preprocessing etc.

    Sets ``self.base``, ``self.pathtoopf``, ``self.book_format``, ``self.opf``,
    ``self.language``, ``self.spine``, ``self.pages`` and ``self.toc``.

    :param processed: Always run the conversion pipeline's ``create_oebbook``.
    :param only_input_plugin: Skip ``create_oebbook`` entirely.
    :param run_char_count: Forwarded to every ``SpineItem``.
    :param read_anchor_map: Forwarded to every ``SpineItem``; also enables
        ``create_indexing_data``.
    :param view_kepub: Forwarded to ``Plumber``.
    :param read_links: Forwarded to every ``SpineItem``.
    :return: ``self``.
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

    self.delete_on_exit = []
    self._tdir = TemporaryDirectory('_ebook_iter')
    self.base = self._tdir.__enter__()

    plumber = Plumber(self.pathtoebook, self.base, self.log, view_kepub=view_kepub)
    plumber.setup_options()
    if self.pathtoebook.lower().endswith('.opf'):
        plumber.opts.dont_package = True
    if hasattr(plumber.opts, 'no_process'):
        plumber.opts.no_process = True
    plumber.input_plugin.for_viewer = True

    with plumber.input_plugin, open(plumber.input, 'rb') as inf:
        self.pathtoopf = plumber.input_plugin(
            inf, plumber.opts, plumber.input_fmt, self.log, {}, self.base)

        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well
            run_pipeline = processed or (
                plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'}
                and not hasattr(self.pathtoopf, 'manifest')
            )
            if run_pipeline:
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts)

        # An object with a manifest is an in-memory book: write it to disk.
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

    extension = os.path.splitext(self.pathtoebook)[1]
    self.book_format = extension[1:].upper()
    if getattr(plumber.input_plugin, 'is_kf8', False):
        joint_suffix = ''
        if getattr(plumber.input_plugin, 'mobi_is_joint', False):
            joint_suffix = ':joint'
        self.book_format = 'KF8' + joint_suffix

    # Reuse an OPF object already built by the input plugin, if it has one.
    self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
    if self.opf is None:
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
    self.language = self.opf.language
    if self.language:
        self.language = self.language.lower()

    # Linear items first, non-linear items after (stable sort keeps order).
    ordered = sorted(self.opf.spine, key=lambda entry: not entry.is_linear)

    self.spine = []
    make_item = partial(SpineItem, read_anchor_map=read_anchor_map, read_links=read_links,
                        run_char_count=run_char_count, from_epub=self.book_format == 'EPUB')
    is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
    for entry in ordered:
        spath = entry.path
        mime = None
        if entry.idref is not None:
            mime = self.opf.manifest.type_for_id(entry.idref)
        if mime is None:
            mime = guess_type(spath)[0]
        try:
            self.spine.append(make_item(spath, mime_type=mime))
            if is_comic:
                self.spine[-1].is_single_page = True
        except:
            self.log.warn('Missing spine item:', repr(spath))

    cover = self.opf.cover
    if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
                                    'azw', 'azw3', 'docx', 'htmlz'}:
        # Generate a title page wrapping the cover and put it first.
        cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
        rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
        chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
        with open(cfile, 'wb') as out:
            out.write(chtml)
        self.spine.insert(0, make_item(cfile, mime_type='application/xhtml+xml'))
        self.delete_on_exit.append(cfile)

    html_toc = self.opf.path_to_html_toc
    if html_toc is not None and html_toc not in self.spine:
        try:
            self.spine.append(make_item(html_toc))
        except:
            import traceback
            traceback.print_exc()

    chars_per_page = float(self.CHARACTERS_PER_PAGE)
    self.pages = [math.ceil(item.character_count/chars_per_page) for item in self.spine]
    next_start = 1
    for item, page_count in zip(self.spine, self.pages):
        item.pages = page_count
        item.start_page = next_start
        next_start += item.pages
        item.max_page = item.start_page + item.pages - 1

    self.toc = self.opf.toc
    if read_anchor_map:
        create_indexing_data(self.spine, self.toc)

    self.verify_links()
    self.read_bookmarks()

    return self
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    '''
    Build an OEB book from the HTML file ``htmlpath`` and the files it links to.

    :param htmlpath: Path of the root HTML file.
    :param basedir: Base directory passed to ``get_filelist``.
    :param opts: Conversion options; ``input_encoding`` is read here.
    :param log: Logger; it is also stored on ``self.log`` and called directly.
    :param mi: Metadata copied into the book with ``meta_info_to_oeb_metadata``.
    :return: The populated OEB book, also stored on ``self.oeb``.
    '''
    import uuid
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import (
        DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME,
        OEB_STYLES, xpath,
    )
    from calibre import guess_type
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    from calibre.ebooks.html.input import get_filelist
    import cssutils, logging
    cssutils.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)
    if not metadata.language:
        oeb.logger.warn("Language not specified")
        metadata.add("language", get_lang().replace("_", "-"))
    if not metadata.creator:
        oeb.logger.warn("Creator not specified")
        metadata.add("creator", self.oeb.translate(__("Unknown")))
    if not metadata.title:
        oeb.logger.warn("Title not specified")
        metadata.add("title", self.oeb.translate(__("Unknown")))
    bookid = str(uuid.uuid4())
    metadata.add("identifier", bookid, id="uuid_id", scheme="uuid")
    for ident in metadata.identifier:
        if "id" in ident.attrib:
            self.oeb.uid = metadata.identifier[0]
            break

    # Add every non-binary file reachable from the root file to manifest+spine.
    filelist = get_filelist(htmlpath, basedir, opts, log)
    filelist = [f for f in filelist if not f.is_binary]
    htmlfile_map = {}
    for f in filelist:
        path = f.path
        oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True)
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id="html", href=ascii_filename(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(item_id, href, "text/html")
        item.html_input_href = bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log("Normalizing filename cases")
    for path, href in htmlfile_map.items():
        if not self.is_case_sensitive(path):
            path = path.lower()
        self.added_resources[path] = href
    # Expose these helpers on self (set here for use outside this method).
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urldefrag
    self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

    self.log("Rewriting HTML links")
    for f in filelist:
        path = f.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        item = oeb.manifest.hrefs[htmlfile_map[path]]
        rewrite_links(item.data, partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type in self.OEB_STYLES:
            # Resolve stylesheet URLs relative to the directory the sheet
            # was added from, when that is known.
            dpath = None
            for path, href in self.added_resources.items():
                if href == item.href:
                    dpath = os.path.dirname(path)
                    break
            cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath))

    # Auto-generate a TOC from <title> values, or from the first heading of
    # each file when the titles are not unique.
    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    titles = []
    headers = []
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = "".join(xpath(html, "/h:html/h:head/h:title/text()"))
        title = re.sub(r"\s+", " ", title.strip())
        if title:
            titles.append(title)
        headers.append("(unlabled)")
        for tag in ("h1", "h2", "h3", "h4", "h5", "strong"):
            expr = "/h:html/h:body//h:%s[position()=1]/text()"
            header = "".join(xpath(html, expr % tag))
            header = re.sub(r"\s+", " ", header.strip())
            if header:
                headers[-1] = header
                break
    use = titles
    if len(titles) > len(set(titles)):
        use = headers
    for title, item in izip(use, self.oeb.spine):
        if not item.linear:
            continue
        toc.add(title, item.href)

    # os.getcwdu() exists only on Python 2; fall back to os.getcwd() so the
    # final container reset does not raise AttributeError on Python 3.
    cwd = getattr(os, 'getcwdu', os.getcwd)()
    oeb.container = DirContainer(cwd, oeb.log, ignore_opf=True)
    return oeb
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    '''
    Build an OEB book from the HTML file ``htmlpath`` and the files it links to.

    :param htmlpath: Path of the root HTML file.
    :param basedir: Base directory passed to ``get_filelist``.
    :param opts: Conversion options; ``input_encoding`` is read, and
        ``language``/``authors`` are used as metadata fallbacks when present.
    :param log: Logger; it is also stored on ``self.log`` and called directly.
    :param mi: Metadata copied into the book with ``meta_info_to_oeb_metadata``.
    :return: The populated OEB book, also stored on ``self.oeb``.
    '''
    import uuid
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize,
                                         urldefrag, BINARY_MIME, OEB_STYLES, xpath, urlquote)
    from calibre import guess_type
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    from calibre.ebooks.html.input import get_filelist
    from calibre.ebooks.metadata import string_to_authors
    from calibre.utils.localization import canonicalize_lang
    import css_parser, logging
    css_parser.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)

    if not metadata.language:
        language = canonicalize_lang(getattr(opts, 'language', None))
        if not language:
            oeb.logger.warn('Language not specified')
            language = get_lang().replace('_', '-')
        metadata.add('language', language)

    if not metadata.creator:
        authors = getattr(opts, 'authors', None)
        if authors:
            authors = string_to_authors(authors)
        if not authors:
            oeb.logger.warn('Creator not specified')
            authors = [self.oeb.translate(__('Unknown'))]
        for author in authors:
            metadata.add('creator', author)

    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate(__('Unknown')))

    metadata.add('identifier', unicode_type(uuid.uuid4()), id='uuid_id', scheme='uuid')
    if any('id' in ident.attrib for ident in metadata.identifier):
        self.oeb.uid = metadata.identifier[0]

    # Add every non-binary file reachable from the root file to manifest+spine.
    filelist = [entry for entry in get_filelist(htmlpath, basedir, opts, log)
                if not entry.is_binary]
    htmlfile_map = {}
    for entry in filelist:
        path = entry.path
        oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True)
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(item_id, href, 'text/html')
        # Only the root file's name is URL-quoted, and only if it has a '%'.
        is_root_with_percent = path == htmlpath and '%' in path
        item.html_input_href = urlquote(bname) if is_root_with_percent else bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    for path, href in htmlfile_map.items():
        key = path if self.is_case_sensitive(path) else path.lower()
        self.added_resources[key] = href
    # Expose these helpers on self (set here for use outside this method).
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urldefrag
    self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

    self.log('Rewriting HTML links')
    for entry in filelist:
        path = entry.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        href = htmlfile_map[path]
        try:
            item = oeb.manifest.hrefs[href]
        except KeyError:
            item = oeb.manifest.hrefs[urlnormalize(href)]
        rewrite_links(item.data, partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type not in self.OEB_STYLES:
            continue
        # Resolve stylesheet URLs relative to the directory the sheet was
        # added from, when that is known.
        dpath = next(
            (os.path.dirname(path) for path, href in self.added_resources.items()
             if href == item.href),
            None)
        css_parser.replaceUrls(item.data, partial(self.resource_adder, base=dpath))

    # Auto-generate a TOC from <title> values, or from the first heading of
    # each file when the titles are not unique.
    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    titles, headers = [], []
    whitespace = r'\s+'
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(whitespace, ' ', title.strip())
        if title:
            titles.append(title)
        label = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            header = ''.join(xpath(html, expr % tag))
            header = re.sub(whitespace, ' ', header.strip())
            if header:
                label = header
                break
        headers.append(label)

    has_duplicate_titles = len(titles) > len(set(titles))
    use = headers if has_duplicate_titles else titles
    for title, item in zip(use, self.oeb.spine):
        if item.linear:
            toc.add(title, item.href)

    oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert an SNB file stream into an OEB book.

    Metadata is read from ``snbf/book.snbf`` and the chapter list from
    ``snbf/toc.snbf``. Each chapter is written as an HTML file into a
    temporary directory that is kept after this method returns (``keep=True``).

    :raises ValueError: If the stream cannot be parsed or is not a valid SNB file.
    :return: The populated OEB book.
    '''
    import uuid
    from calibre.ebooks.oeb.base import DirContainer
    from calibre.ebooks.snb.snbfile import SNBFile
    from calibre.utils.xml_parse import safe_xml_fromstring

    log.debug("Parsing SNB file...")
    snbFile = SNBFile()
    try:
        snbFile.Parse(stream)
    except:
        raise ValueError("Invalid SNB file")
    if not snbFile.IsValid():
        log.debug("Invalid SNB file")
        raise ValueError("Invalid SNB file")

    log.debug("Handle meta data ...")
    from calibre.ebooks.conversion.plumber import create_oebbook
    oeb = create_oebbook(log, None, options,
                         encoding=options.input_encoding, populate=False)

    meta = snbFile.GetFileStream('snbf/book.snbf')
    if meta is not None:
        meta = safe_xml_fromstring(meta)
        field_paths = {
            'title': './/head/name',
            'creator': './/head/author',
            'language': './/head/language',
            'generator': './/head/generator',
            'publisher': './/head/publisher',
            'cover': './/head/cover',
        }
        # Missing nodes and empty nodes both become the empty string.
        values = {}
        for field, expr in field_paths.items():
            node = meta.find(expr)
            values[field] = '' if node is None or node.text is None else node.text

        oeb.metadata.add('title', values['title'])
        oeb.metadata.add('creator', values['creator'], attrib={'role': 'aut'})
        oeb.metadata.add('language', values['language'].lower().replace('_', '-'))
        oeb.metadata.add('generator', values['generator'])
        oeb.metadata.add('publisher', values['publisher'])
        if values['cover'] != '':
            oeb.guide.add('cover', 'Cover', values['cover'])

    oeb.metadata.add('identifier', unicode_type(uuid.uuid4()), id='uuid_id', scheme='uuid')
    if any('id' in ident.attrib for ident in oeb.metadata.identifier):
        oeb.uid = oeb.metadata.identifier[0]

    with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
        log.debug('Process TOC ...')
        toc = snbFile.GetFileStream('snbf/toc.snbf')
        oeb.container = DirContainer(tdir, log)
        if toc is not None:
            toc = safe_xml_fromstring(toc)
            # Only chapters whose data exists consume a file number.
            chapter_no = 1
            for ch in toc.find('.//body'):
                chapterName = ch.text
                chapterSrc = ch.get('src')
                fname = 'ch_%d.htm' % chapter_no
                data = snbFile.GetFileStream('snbc/' + chapterSrc)
                if data is None:
                    continue
                snbc = safe_xml_fromstring(data)
                paragraphs = []
                for node in snbc.find('.//body'):
                    if node.tag == 'text':
                        paragraphs.append('<p>%s</p>' % html_encode(node.text))
                    elif node.tag == 'img':
                        paragraphs.append('<p><img src="%s" /></p>' % html_encode(node.text))
                page = HTML_TEMPLATE % (chapterName, '\n'.join(paragraphs))
                with open(os.path.join(tdir, fname), 'wb') as out:
                    out.write(page.encode('utf-8', 'replace'))
                oeb.toc.add(ch.text, fname)
                item_id, href = oeb.manifest.generate(id='html', href=ascii_filename(fname))
                item = oeb.manifest.add(item_id, href, 'text/html')
                item.html_input_href = fname
                oeb.spine.add(item, True)
                chapter_no += 1

            # Register the images written out by the SNB file.
            for image_name, mime in snbFile.OutputImageFiles(tdir):
                item_id, href = oeb.manifest.generate(id='image', href=ascii_filename(image_name))
                item = oeb.manifest.add(item_id, href, mime)
                item.html_input_href = image_name

    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert an SNB file stream into an OEB book.

    Metadata is read from ``snbf/book.snbf`` and the chapter list from
    ``snbf/toc.snbf``. Each chapter is written as an HTML file into a
    temporary directory that is kept after this method returns (``keep=True``).

    :param stream: Open SNB file object.
    :param options: Conversion options; ``input_encoding`` is read here.
    :param file_ext: Unused by this method.
    :param log: Logger for progress messages and the OEB container.
    :param accelerators: Unused by this method.
    :raises ValueError: If the stream cannot be parsed or is not a valid SNB file.
    :return: The populated OEB book.
    '''
    import uuid
    from lxml import etree
    from calibre.ebooks.oeb.base import DirContainer
    from calibre.ebooks.snb.snbfile import SNBFile

    log.debug("Parsing SNB file...")
    snbFile = SNBFile()
    try:
        snbFile.Parse(stream)
    except Exception:
        # Only real parse errors become ValueError; KeyboardInterrupt and
        # SystemExit are no longer masked as an invalid-file error.
        raise ValueError("Invalid SNB file")
    if not snbFile.IsValid():
        log.debug("Invaild SNB file")
        raise ValueError("Invalid SNB file")

    log.debug("Handle meta data ...")
    from calibre.ebooks.conversion.plumber import create_oebbook
    oeb = create_oebbook(log, None, options,
            encoding=options.input_encoding, populate=False)
    meta = snbFile.GetFileStream('snbf/book.snbf')
    if meta is not None:
        meta = etree.fromstring(meta)
        l = {'title'    : './/head/name',
             'creator'  : './/head/author',
             'language' : './/head/language',
             'generator': './/head/generator',
             'publisher': './/head/publisher',
             'cover'    : './/head/cover', }
        # Missing nodes and empty nodes both become the empty string.
        d = {}
        for item in l:
            node = meta.find(l[item])
            if node is not None:
                d[item] = node.text if node.text is not None else ''
            else:
                d[item] = ''

        oeb.metadata.add('title', d['title'])
        oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
        oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
        oeb.metadata.add('generator', d['generator'])
        oeb.metadata.add('publisher', d['publisher'])
        if d['cover'] != '':
            oeb.guide.add('cover', 'Cover', d['cover'])

    bookid = str(uuid.uuid4())
    oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    for ident in oeb.metadata.identifier:
        if 'id' in ident.attrib:
            oeb.uid = oeb.metadata.identifier[0]
            break

    with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
        log.debug('Process TOC ...')
        toc = snbFile.GetFileStream('snbf/toc.snbf')
        oeb.container = DirContainer(tdir, log)
        if toc is not None:
            toc = etree.fromstring(toc)
            # Only chapters whose data exists consume a file number.
            i = 1
            for ch in toc.find('.//body'):
                chapterName = ch.text
                chapterSrc = ch.get('src')
                fname = 'ch_%d.htm' % i
                data = snbFile.GetFileStream('snbc/' + chapterSrc)
                if data is None:
                    continue
                snbc = etree.fromstring(data)
                lines = []
                for line in snbc.find('.//body'):
                    if line.tag == 'text':
                        lines.append(u'<p>%s</p>' % html_encode(line.text))
                    elif line.tag == 'img':
                        lines.append(u'<p><img src="%s" /></p>' % html_encode(line.text))
                # Build the page first and write it inside a context manager,
                # so the file handle is closed even if something raises.
                page = (HTML_TEMPLATE % (chapterName, u'\n'.join(lines))).encode('utf-8', 'replace')
                with open(os.path.join(tdir, fname), 'wb') as outputFile:
                    outputFile.write(page)
                oeb.toc.add(ch.text, fname)
                item_id, href = oeb.manifest.generate(id='html',
                    href=ascii_filename(fname))
                item = oeb.manifest.add(item_id, href, 'text/html')
                item.html_input_href = fname
                oeb.spine.add(item, True)
                i = i + 1

            # Register the images written out by the SNB file.
            imageFiles = snbFile.OutputImageFiles(tdir)
            for f, m in imageFiles:
                item_id, href = oeb.manifest.generate(id='image',
                    href=ascii_filename(f))
                item = oeb.manifest.add(item_id, href, m)
                item.html_input_href = f

    return oeb
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    """Build an OEB book from ``htmlpath`` and the non-binary files found
    for it by ``get_filelist``.

    Steps, in order:

    1. Create an unpopulated book and copy ``mi`` into its metadata, filling
       in language, creator and title fallbacks and a fresh UUID identifier.
    2. Add every non-binary file from ``get_filelist`` to the manifest and
       spine.
    3. Rewrite links in the HTML items and URLs in stylesheet items through
       ``self.resource_adder``.
    4. Generate a TOC from document titles, or from first headings when the
       titles contain duplicates.

    Returns the book, with its container reset to the current directory.
    """
    import uuid
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import (
        DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME,
        OEB_STYLES, xpath)
    from calibre import guess_type
    from calibre.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    from calibre.ebooks.html.input import get_filelist
    from calibre.ebooks.metadata import string_to_authors
    from calibre.utils.localization import canonicalize_lang
    import cssutils
    import logging

    cssutils.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    book = create_oebbook(log, None, opts, self,
                          encoding=opts.input_encoding, populate=False)
    self.oeb = book

    # --- metadata, with fallbacks for anything ``mi`` did not supply ---
    metadata = book.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)

    if not metadata.language:
        lang = canonicalize_lang(getattr(opts, 'language', None))
        if not lang:
            book.logger.warn(u'Language not specified')
            lang = get_lang().replace('_', '-')
        metadata.add('language', lang)

    if not metadata.creator:
        authors = getattr(opts, 'authors', None)
        if authors:
            authors = string_to_authors(authors)
        if not authors:
            book.logger.warn('Creator not specified')
            authors = [self.oeb.translate(__('Unknown'))]
        for author in authors:
            metadata.add('creator', author)

    if not metadata.title:
        book.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate(__('Unknown')))

    metadata.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
    # As soon as any identifier carries an 'id' attribute, the first
    # identifier becomes the book uid.
    if any('id' in ident.attrib for ident in metadata.identifier):
        self.oeb.uid = metadata.identifier[0]

    # --- manifest and spine entries for every non-binary input file ---
    html_files = [entry
                  for entry in get_filelist(htmlpath, basedir, opts, log)
                  if not entry.is_binary]
    href_by_path = {}
    for entry in html_files:
        source_path = entry.path
        folder, base_name = os.path.split(source_path)
        book.container = DirContainer(folder, log, ignore_opf=True)
        item_id, href = book.manifest.generate(
            id='html', href=ascii_filename(base_name))
        href_by_path[source_path] = href
        html_item = book.manifest.add(item_id, href, 'text/html')
        html_item.html_input_href = base_name
        book.spine.add(html_item, True)

    # --- state read by self.resource_adder ---
    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    for source_path, href in href_by_path.items():
        if self.is_case_sensitive(source_path):
            key = source_path
        else:
            key = source_path.lower()
        self.added_resources[key] = href
    self.urlnormalize = urlnormalize
    self.DirContainer = DirContainer
    self.urldefrag = urldefrag
    self.guess_type = guess_type
    self.BINARY_MIME = BINARY_MIME

    self.log('Rewriting HTML links')
    for entry in html_files:
        source_path = entry.path
        folder = os.path.dirname(source_path)
        # The container is pointed at the file's own directory before the
        # item's data is accessed.
        book.container = DirContainer(folder, log, ignore_opf=True)
        html_item = book.manifest.hrefs[href_by_path[source_path]]
        rewrite_links(html_item.data,
                      partial(self.resource_adder, base=folder))

    for css_item in book.manifest.values():
        if css_item.media_type not in self.OEB_STYLES:
            continue
        # Directory of the first added resource with this href, else None.
        css_dir = next(
            (os.path.dirname(res_path)
             for res_path, res_href in self.added_resources.items()
             if res_href == css_item.href),
            None)
        cssutils.replaceUrls(css_item.data,
                             partial(self.resource_adder, base=css_dir))

    # --- auto-generated table of contents ---
    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    titles, headers = [], []
    heading_expr = '/h:html/h:body//h:%s[position()=1]/text()'
    for spine_item in self.oeb.spine:
        if not spine_item.linear:
            continue
        html = spine_item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(r'\s+', ' ', title.strip())
        if title:
            titles.append(title)
        heading = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            candidate = ''.join(xpath(html, heading_expr % tag))
            candidate = re.sub(r'\s+', ' ', candidate.strip())
            if candidate:
                heading = candidate
                break
        headers.append(heading)

    # Headings are used only when the collected titles contain duplicates.
    # NOTE(review): ``titles`` skips items whose <title> is empty, so its
    # entries may not line up one-to-one with the spine -- kept as-is.
    labels = headers if len(titles) > len(set(titles)) else titles
    for label, spine_item in izip(labels, self.oeb.spine):
        if spine_item.linear:
            toc.add(label, spine_item.href)

    book.container = DirContainer(os.getcwdu(), book.log, ignore_opf=True)
    return book
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an SNB container read from ``stream`` into an OEB book.

    Metadata is read from ``snbf/book.snbf`` (when present) and the chapter
    list from ``snbf/toc.snbf`` (when present).  Every chapter listed in the
    TOC whose ``snbc/<src>`` payload exists is rendered to a ``ch_<n>.htm``
    file inside a temporary directory created with ``keep=True`` and is added
    to the manifest, spine and TOC.  Image files written by
    ``SNBFile.OutputImageFiles`` are added to the manifest.

    ``file_ext`` and ``accelerators`` are accepted but not used here.

    Returns the object produced by ``create_oebbook``.

    Raises ``ValueError`` if the stream cannot be parsed or the parsed
    container reports itself as invalid.
    """
    import uuid
    from lxml import etree
    from calibre.ebooks.oeb.base import DirContainer
    from calibre.ebooks.snb.snbfile import SNBFile

    log.debug("Parsing SNB file...")
    snbFile = SNBFile()
    try:
        snbFile.Parse(stream)
    except Exception:
        # Only real errors are reported as an invalid file; KeyboardInterrupt
        # and SystemExit are allowed to propagate untouched.
        raise ValueError("Invalid SNB file")
    if not snbFile.IsValid():
        log.debug("Invaild SNB file")
        raise ValueError("Invalid SNB file")

    log.debug("Handle meta data ...")
    from calibre.ebooks.conversion.plumber import create_oebbook
    oeb = create_oebbook(log, None, options,
                         encoding=options.input_encoding, populate=False)
    meta = snbFile.GetFileStream("snbf/book.snbf")
    # Identity tests replace the original ``!= None`` comparisons.
    if meta is not None:
        meta = etree.fromstring(meta)
        field_paths = {
            "title": ".//head/name",
            "creator": ".//head/author",
            "language": ".//head/language",
            "generator": ".//head/generator",
            "publisher": ".//head/publisher",
            "cover": ".//head/cover",
        }
        # A missing node and a node without text both become "".
        fields = {}
        for key, node_path in field_paths.items():
            node = meta.find(node_path)
            text = node.text if node is not None else None
            fields[key] = text if text is not None else ""

        oeb.metadata.add("title", fields["title"])
        oeb.metadata.add("creator", fields["creator"], attrib={"role": "aut"})
        oeb.metadata.add("language",
                         fields["language"].lower().replace("_", "-"))
        oeb.metadata.add("generator", fields["generator"])
        oeb.metadata.add("publisher", fields["publisher"])
        if fields["cover"] != "":
            oeb.guide.add("cover", "Cover", fields["cover"])

    bookid = str(uuid.uuid4())
    oeb.metadata.add("identifier", bookid, id="uuid_id", scheme="uuid")
    for ident in oeb.metadata.identifier:
        if "id" in ident.attrib:
            # NOTE(review): this stores the first identifier, not necessarily
            # the one that matched -- kept as in the original; confirm intent.
            oeb.uid = oeb.metadata.identifier[0]
            break

    with TemporaryDirectory("_snb2oeb", keep=True) as tdir:
        log.debug("Process TOC ...")
        toc = snbFile.GetFileStream("snbf/toc.snbf")
        oeb.container = DirContainer(tdir, log)
        if toc is not None:
            toc = etree.fromstring(toc)
            # The counter only advances for chapters that are actually
            # written, so generated file names stay consecutive.
            chapter_no = 1
            for ch in toc.find(".//body"):
                chapterName = ch.text
                chapterSrc = ch.get("src")
                fname = "ch_%d.htm" % chapter_no
                data = snbFile.GetFileStream("snbc/" + chapterSrc)
                if data is None:
                    continue
                snbc = etree.fromstring(data)
                lines = []
                for line in snbc.find(".//body"):
                    if line.tag == "text":
                        lines.append(u"<p>%s</p>" % html_encode(line.text))
                    elif line.tag == "img":
                        lines.append(u'<p><img src="%s" /></p>'
                                     % html_encode(line.text))
                markup = HTML_TEMPLATE % (chapterName, u"\n".join(lines))
                # The context manager closes the file even if the write fails.
                with open(os.path.join(tdir, fname), "wb") as outputFile:
                    outputFile.write(markup.encode("utf-8", "replace"))
                oeb.toc.add(ch.text, fname)
                item_id, href = oeb.manifest.generate(
                    id="html", href=ascii_filename(fname))
                item = oeb.manifest.add(item_id, href, "text/html")
                item.html_input_href = fname
                oeb.spine.add(item, True)
                chapter_no += 1
            imageFiles = snbFile.OutputImageFiles(tdir)
            for image_name, mime in imageFiles:
                item_id, href = oeb.manifest.generate(
                    id="image", href=ascii_filename(image_name))
                item = oeb.manifest.add(item_id, href, mime)
                item.html_input_href = image_name

    return oeb