def check_link_destinations(container):
    ' Check destinations of links that point to HTML files '
    errors = []
    dest_map = {}
    opf_type, ncx_type = guess_type("a.opf"), guess_type("a.ncx")
    for name, mt in container.mime_map.iteritems():
        if mt in OEB_DOCS:
            # <a href="..."> anchors in HTML documents
            for anchor in container.parsed(name).xpath('//*[local-name()="a" and @href]'):
                check_link_destination(container, dest_map, name, anchor.get("href"), anchor, errors)
        elif mt == opf_type:
            # Guide references in the OPF
            for ref in container.opf_xpath("//opf:reference[@href]"):
                # Cover guide entries are not checked for AZW3 books
                if container.book_type == "azw3" and ref.get("type") in {
                    "cover",
                    "other.ms-coverimage-standard",
                    "other.ms-coverimage",
                }:
                    continue
                check_link_destination(container, dest_map, name, ref.get("href"), ref, errors)
        elif mt == ncx_type:
            # <content src="..."> entries in the NCX ToC
            for content in container.parsed(name).xpath('//*[local-name() = "content" and @src]'):
                check_link_destination(container, dest_map, name, content.get("src"), content, errors)
    return errors
def help_url(item, item_type, doc_name, extra_data=None):
    '''
    Return a documentation URL for item (of type item_type, e.g.
    'css_property') found in the document doc_name, or None when no help
    entry exists.
    '''
    url = None
    url_maps = ()
    item = item.lower()
    if item_type == 'css_property':
        url_maps = ('css',)
    else:
        mt = guess_type(doc_name)
        if mt in OEB_DOCS:
            url_maps = ('html', 'svg', 'mathml')
        elif mt == guess_type('a.svg'):
            url_maps = ('svg',)
        elif mt == guess_type('a.opf'):
            # extra_data, when present, is the OPF version string
            version = '3' if getattr(extra_data, 'startswith', lambda x: False)('3') else '2'
            url_maps = ('opf' + version,)
        # NOTE: a second, unreachable `elif mt == guess_type('a.svg')`
        # branch was removed here — it duplicated the check above
        elif mt == guess_type('a.ncx'):
            url_maps = ('opf2',)
    for umap in url_maps:
        umap = _url_map[umap]
        if item in umap:
            url = umap[item]
            break
        # Retry with any namespace prefix stripped from the item name
        item = item.partition(':')[-1]
        if item and item in umap:
            url = umap[item]
            break
    return url
def iterlinks(self, name, get_line_numbers=True):
    '''
    Iterate over all links in name. If get_line_numbers is True the yields
    results of the form (link, line_number, offset). Where line_number is
    the line_number at which the link occurs and offset is the number of
    characters from the start of the line. Note that offset could actually
    encompass several lines if not zero.

    Dispatches on the media type of name: OPF (href attributes), HTML
    documents, CSS stylesheets and NCX ToC (src attributes).
    '''
    media_type = self.mime_map.get(name, guess_type(name))
    if name == self.opf_name:
        # All href attributes in the OPF are links
        for elem in self.opf_xpath('//*[@href]'):
            yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
    elif media_type.lower() in OEB_DOCS:
        for el, attr, link, pos in iterlinks(self.parsed(name)):
            yield (link, el.sourceline, pos) if get_line_numbers else link
    elif media_type.lower() in OEB_STYLES:
        if get_line_numbers:
            # Normalize line endings so offsets map cleanly to (line, col)
            with self.open(name, 'rb') as f:
                raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
                position = PositionFinder(raw)
                is_in_comment = CommentFinder(raw)
                for link, offset in itercsslinks(raw):
                    # Links inside CSS comments are not real links
                    if not is_in_comment(offset):
                        lnum, col = position(offset)
                        yield link, lnum, col
        else:
            for link in getUrls(self.parsed(name)):
                yield link
    elif media_type.lower() == guess_type('toc.ncx'):
        for elem in self.parsed(name).xpath('//*[@src]'):
            yield (elem.get('src'), elem.sourceline, 0) if get_line_numbers else elem.get('src')
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    ''' Create an empty book in the specified format at the specified location. '''
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Extract the book language from the OPF metadata, default 'und' (undetermined)
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    # Build a minimal manifest: the start page and the NCX ToC
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    # Spine with a single itemref pointing at the start page
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    # Fill in the new-book HTML template with language, title and authors
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)
    if fmt == 'azw3':
        # For AZW3, write the files to a temp dir and convert via opf_to_azw3
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # The EPUB spec requires the mimetype file first and uncompressed
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            zf.writestr('META-INF/', b'', 0755)  # NOTE: Python 2 octal literal
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def create_epub(manifest, spine=(), guide=(), meta_cover=None, ver=3):
    ''' Build an in-memory EPUB from the given manifest/spine/guide entries
    and return it as a BytesIO. When spine is empty, all HTML manifest
    entries are used as the spine. '''
    item_tags = []
    for name, data, properties in manifest:
        props = 'properties="%s"' % properties if properties else ''
        item_tags.append('<item id="%s" href="%s" media-type="%s" %s/>' % (
            name, name, guess_type(name), props))
    manifest_xml = ''.join(item_tags)
    metadata = '<meta name="cover" content="%s"/>' % meta_cover if meta_cover else ''
    if not spine:
        spine = [x[0] for x in manifest if guess_type(x[0]) in OEB_DOCS]
    spine_xml = ''.join('<itemref idref="%s"/>' % name for name in spine)
    guide_xml = ''.join('<reference href="%s" type="%s"/>' % (name, typ) for name, typ in guide)
    opf = OPF_TEMPLATE.format(
        manifest=manifest_xml, ver='%d.0' % ver, metadata=metadata,
        spine=spine_xml, guide=guide_xml)
    buf = BytesIO()
    with ZipFile(buf, 'w', ZIP_STORED) as zf:
        zf.writestr('META-INF/container.xml', b'''
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="content.opf" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>''')
        zf.writestr('content.opf', opf.encode('utf-8'))
        for name, data, properties in manifest:
            # Text content is stored UTF-8 encoded
            if isinstance(data, type('')):
                data = data.encode('utf-8')
            zf.writestr(name, data)
    buf.seek(0)
    return buf
def check_links(container):
    # Check every link in the book's HTML/CSS/OPF/NCX files: report broken
    # links, file:// links and links escaping the book, then report
    # stylesheets, documents and resources that are never referenced from
    # the spine, and finally files missing from the manifest.
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append

    def fl(x):
        # Format a link for display, dropping the Python 2 u'' repr prefix
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x

    for name, mt in container.mime_map.iteritems():
        if mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types:
            for href, lnum, col in container.iterlinks(name):
                tname = container.href_to_name(href, name)
                if tname is not None:
                    if container.exists(tname):
                        # Record the reference for the spine-reachability pass
                        links_map[name].add(tname)
                    else:
                        a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
                else:
                    purl = urlparse(href)
                    if purl.scheme == 'file':
                        a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
                    elif purl.path and purl.path.startswith('/') and purl.scheme in {'', 'file'}:
                        a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))

    # Compute the set of stylesheets reachable from the spine, iterating to
    # a fixed point to follow @import chains between stylesheets
    spine_docs = {name for name, linear in container.spine_names}
    spine_styles = {tname for name in spine_docs for tname in links_map[name] if container.mime_map[tname] in OEB_STYLES}
    num = -1
    while len(spine_styles) > num:  # Handle import rules in stylesheets
        num = len(spine_styles)
        spine_styles |= {tname for name in spine_styles for tname in links_map[name] if container.mime_map[tname] in OEB_STYLES}
    seen = set(OEB_DOCS) | set(OEB_STYLES)
    # Non-document, non-style resources referenced from spine docs or their styles
    spine_resources = {tname for name in spine_docs | spine_styles for tname in links_map[name] if container.mime_map[tname] not in seen}
    unreferenced = set()

    cover_name = container.guide_type_map.get('cover', None)
    for name, mt in container.mime_map.iteritems():
        if mt in OEB_STYLES and name not in spine_styles:
            a(UnreferencedResource(name))
        elif mt in OEB_DOCS and name not in spine_docs:
            a(UnreferencedDoc(name))
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {'image', 'audio', 'video'}) and name not in spine_resources and name != cover_name:
            # The cover image is exempt even though it may not be linked to
            a(UnreferencedResource(name))
        else:
            continue
        unreferenced.add(name)

    # Report files present on disk but missing from the OPF manifest
    manifest_names = set(container.manifest_id_map.itervalues())
    for name in container.mime_map:
        if name not in container.names_that_need_not_be_manifested and name not in manifest_names:
            a(Unmanifested(name))
    return errors
def __init__(self, rootpath, opfpath, log, clone_data=None):
    # Container for a book unzipped at rootpath with its OPF at opfpath.
    # When clone_data is provided, this is a cheap clone of an existing
    # container and the filesystem walk is skipped entirely.
    self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
    self.log = log
    self.html_preprocessor = HTMLPreProcessor()
    self.css_preprocessor = CSSPreProcessor()
    self.tweak_mode = False

    self.parsed_cache = {}      # name -> parsed (lxml/css) representation
    self.mime_map = {}          # name -> media type
    self.name_path_map = {}     # name -> absolute filesystem path
    self.dirtied = set()        # names with unsaved parsed modifications
    self.encoding_map = {}      # name -> encoding used when parsing
    self.pretty_print = set()
    self.cloned = False
    self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

    if clone_data is not None:
        self.cloned = True
        for x in ('name_path_map', 'opf_name', 'mime_map', 'pretty_print', 'encoding_map', 'tweak_mode'):
            setattr(self, x, clone_data[x])
        self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
        return

    # Map of relative paths with '/' separators from root of unzipped ePub
    # to absolute paths on filesystem with os-specific separators
    opfpath = os.path.abspath(os.path.realpath(opfpath))
    for dirpath, _dirnames, filenames in os.walk(self.root):
        for f in filenames:
            path = join(dirpath, f)
            name = self.abspath_to_name(path)
            if isosx:
                # OS X silently changes all file names to NFD form. The
                # EPUB spec requires all text including filenames to be in
                # NFC form. The proper fix is to implement a VFS that maps
                # between canonical names and their filesystem
                # representation, however, I dont have the time for that
                # now, so this will at least fix the problem for books that
                # properly use the NFC form. Books that use the NFD form
                # will be broken by this, but that's the price you pay for
                # using OS X.
                name = unicodedata.normalize('NFC', name)
            self.name_path_map[name] = path
            self.mime_map[name] = guess_type(path)
            # Special case if we have stumbled onto the opf
            if path == opfpath:
                self.opf_name = name
                self.opf_dir = os.path.dirname(path)
                self.mime_map[name] = guess_type('a.opf')

    if not hasattr(self, 'opf_name'):
        raise InvalidBook('Could not locate opf file: %r'%opfpath)

    # Update mime map with data from the OPF
    self.refresh_mime_map()
def __init__(self, rootpath, opfpath, log, clone_data=None):
    # Container for a book unzipped at rootpath with its OPF at opfpath.
    # This variant unconditionally normalizes names to NFC (see the long
    # comment below). A clone_data dict makes this a cheap clone.
    self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
    self.log = log
    self.html_preprocessor = HTMLPreProcessor()
    self.css_preprocessor = CSSPreProcessor()
    self.tweak_mode = False

    self.parsed_cache = {}      # name -> parsed (lxml/css) representation
    self.mime_map = {}          # name -> media type
    self.name_path_map = {}     # name -> absolute filesystem path
    self.dirtied = set()        # names with unsaved parsed modifications
    self.encoding_map = {}      # name -> encoding used when parsing
    self.pretty_print = set()
    self.cloned = False
    self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

    if clone_data is not None:
        self.cloned = True
        for x in ('name_path_map', 'opf_name', 'mime_map', 'pretty_print', 'encoding_map', 'tweak_mode'):
            setattr(self, x, clone_data[x])
        self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
        return

    # Map of relative paths with '/' separators from root of unzipped ePub
    # to absolute paths on filesystem with os-specific separators
    opfpath = os.path.abspath(os.path.realpath(opfpath))
    for dirpath, _dirnames, filenames in os.walk(self.root):
        for f in filenames:
            path = join(dirpath, f)
            name = self.abspath_to_name(path)
            # OS X silently changes all file names to NFD form. The EPUB
            # spec requires all text including filenames to be in NFC form.
            # The proper fix is to implement a VFS that maps between
            # canonical names and their file system representation, however,
            # I dont have the time for that now. Note that the container
            # ensures that all text files are normalized to NFC when
            # decoding them anyway, so there should be no mismatch between
            # names in the text and NFC canonical file names.
            name = unicodedata.normalize('NFC', name)
            self.name_path_map[name] = path
            self.mime_map[name] = guess_type(path)
            # Special case if we have stumbled onto the opf
            if path == opfpath:
                self.opf_name = name
                self.opf_dir = os.path.dirname(path)
                self.mime_map[name] = guess_type('a.opf')

    if not hasattr(self, 'opf_name'):
        raise InvalidBook('Could not locate opf file: %r'%opfpath)

    # Update mime map with data from the OPF
    self.refresh_mime_map()
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    '''
    Download a single external resource (http(s), file:// or data: URL)
    into a temporary file inside tdir.

    :param data_uri_map: cache mapping decoded data: payloads to previously
        downloaded file paths, so duplicate data URIs are stored only once.
    :return: (True, (url, filename, path_to_data, media_type)) on success,
        (False, (url, error_message)) on failure.
    '''
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # Bug fix: the basename must be taken from the URL path,
                # not from the open file object
                filename = os.path.basename(purl.path)
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = standard_b64decode(payload)
                else:
                    payload = payload.encode('utf-8')
                # Derive a filename from the declared media type (if any)
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    # Bug fix: filename is now computed before this early
                    # return; previously it was referenced unassigned here
                    return True, (url, filename, seen_before, guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
def download_one(tdir, timeout, progress_report, url):
    '''
    Download a single external resource (http(s) or file:// URL) into a
    temporary file inside tdir.

    :return: (True, (url, filename, path_to_data, media_type)) on success,
        (False, (url, error_message)) on failure.
    '''
    try:
        purl = urlparse(url)
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # Bug fix: the basename must be taken from the URL path,
                # not from the open file object
                filename = os.path.basename(purl.path)
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            filename = sanitize_file_name2(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            # filename was already sanitized above; the previous redundant
            # second sanitize_file_name2() call has been removed
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
def get_recommended_folders(container, names):
    ''' Return the folders that are recommended for the given filenames. The
    recommendation is based on where the majority of files of the same type
    are located in the container. If no files of a particular type are
    present, the recommended folder is assumed to be the folder containing
    the OPF file. '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    counts = defaultdict(Counter)
    for name, mt in container.mime_map.iteritems():
        folder = name.rpartition('/')[0] if '/' in name else ''
        counts[mt_to_category(container, mt)][folder] += 1

    try:
        opf_folder = counts['opf'].most_common(1)[0][0]
    except (KeyError, IndexError):
        # Bug fix: counts is a defaultdict, so a missing 'opf' category
        # yields an empty Counter and most_common(1)[0] raises IndexError,
        # which the original `except KeyError` did not catch
        opf_folder = ''

    recommendations = {
        category: counter.most_common(1)[0][0]
        for category, counter in counts.iteritems()
    }
    return {
        n: recommendations.get(
            mt_to_category(container, guess_type(os.path.basename(n))),
            opf_folder)
        for n in names
    }
def get_decoded_raw(name):
    # Read the file at path name and return (raw, syntax) where raw is
    # unicode text when the syntax is textual and decoding succeeded,
    # otherwise the original bytes.
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding
    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))
    if syntax is None:
        # Unknown syntax: try UTF-8, silently keep bytes on failure
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass
    elif syntax != 'raster_image':
        if syntax in {'html', 'xml'}:
            raw = xml_to_unicode(raw, verbose=True)[0]
        else:
            # Look for a PEP 263 style coding declaration near the top,
            # otherwise fall back to charset detection
            m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
            if m is not None and m.group(1) != '8bit':
                enc = m.group(1)
                if enc == b'unicode':
                    enc = 'utf-8'
            else:
                enc = force_encoding(raw, verbose=True)
            try:
                raw = raw.decode(enc)
            except (LookupError, ValueError):
                # Unknown or wrong encoding: last resort is UTF-8
                try:
                    raw = raw.decode('utf-8')
                except ValueError:
                    pass
    return raw, syntax
def guess_type(self, name):
    ''' Guess the media type for name, mapping text/html to
    application/xhtml+xml because epubcheck complains about text/html
    documents in EPUB 2 books. '''
    mt = guess_type(name)
    return 'application/xhtml+xml' if mt == 'text/html' else mt
def __init__(self, path_to_ebook, tdir, log=None):
    # Prepare a book for rendering: extract it into tdir, build the render
    # manifest, virtualize resources and write the manifest JSON to disk.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # Files that must not be served to the client: the OPF, the NCX ToC and
    # everything under META-INF/
    excluded_names = {
        name for name, mt in self.mime_map.iteritems() if
        name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/')
    }
    self.book_render_data = data = {
        'version': self.RENDER_VERSION,
        'toc': get_toc(self).as_dict,
        'spine': [name for name, is_linear in self.spine_names],
        'link_uid': uuid4(),
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'manifest': list(set(self.name_path_map) - excluded_names),
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.virtualize_resources()
    self.commit()
    # The excluded files are not needed after rendering, remove them
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
def manifest_key(x):
    # Sort key for manifest items: HTML documents first (in spine order),
    # then NCX, styles, images, fonts, audio, video, everything else; ties
    # broken by href sort order.
    mt = x.get('media-type', '')
    href = x.get('href', '')
    ext = href.rpartition('.')[-1].lower()
    if mt in OEB_DOCS:
        cat = 0
    elif mt == guess_type('a.ncx'):
        cat = 1
    elif mt in OEB_STYLES:
        cat = 2
    elif mt.startswith('image/'):
        cat = 3
    elif ext in {'otf', 'ttf', 'woff'}:
        cat = 4
    elif mt.startswith('audio/'):
        cat = 5
    elif mt.startswith('video/'):
        cat = 6
    else:
        cat = 1000
    # HTML documents sort by their position in the spine
    secondary = spine_ids.get(x.get('id', None), 1000000000) if cat == 0 else sort_key(href)
    return (cat, secondary)
def __init__(self, name, lnum, bad_idref=None, bad_mimetype=None):
    ''' Error for a broken Table of Contents reference: either the
    referenced manifest item is missing (bad_idref) or it has the wrong
    media type (bad_mimetype). '''
    if bad_idref is None:
        msg = _('The item identified as the Table of Contents has an incorrect media-type (%s)') % bad_mimetype
        self.HELP = _('The media type for the table of contents must be %s') % guess_type('a.ncx')
    else:
        msg = _('The item identified as the Table of Contents (%s) does not exist') % bad_idref
        self.HELP = _('There is no item with id="%s" in the manifest.') % bad_idref
    BaseError.__init__(self, msg, name, lnum)
def mt_to_category(container, mt):
    ''' Classify a media type into one of the broad categories used for
    folder recommendations: 'text', 'style', 'font', 'opf', 'toc' or the
    major part of the media type. '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    from calibre.ebooks.oeb.polish.container import OEB_FONTS
    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
    if mt in OEB_DOCS:
        return 'text'
    if mt in OEB_STYLES:
        return 'style'
    if mt in OEB_FONTS:
        return 'font'
    if mt == guess_type('a.opf'):
        return 'opf'
    if mt == guess_type('a.ncx'):
        return 'toc'
    return mt.partition('/')[0]
def find_existing_ncx_toc(container):
    ''' Return the name of the existing NCX Table of Contents file, if any.
    The file named by the spine's toc attribute is preferred; otherwise the
    first manifest item with the NCX media type is used. '''
    toc_id = container.opf_xpath('//opf:spine/@toc')
    toc = container.manifest_id_map.get(toc_id[0], None) if toc_id else None
    if not toc:
        toc = container.manifest_type_map.get(guess_type('a.ncx'), [None])[0]
    return toc or None
def parsed(self, name):
    ''' Return the parsed representation of the file name, caching the
    result and recording the encoding that was used to decode it. '''
    cached = self.parsed_cache.get(name, None)
    if cached is not None:
        return cached
    self.used_encoding = None
    mime = self.mime_map.get(name, guess_type(name))
    cached = self.parse(self.name_path_map[name], mime)
    self.parsed_cache[name] = cached
    self.encoding_map[name] = self.used_encoding
    return cached
def mt_to_category(container, mt):
    """Return the broad category ("text", "style", "font", "opf", "toc" or
    the major media-type part) for the given media type."""
    from calibre.ebooks.oeb.polish.utils import guess_type
    from calibre.ebooks.oeb.polish.container import OEB_FONTS
    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
    for collection, category in ((OEB_DOCS, "text"), (OEB_STYLES, "style"), (OEB_FONTS, "font")):
        if mt in collection:
            return category
    if mt == guess_type("a.opf"):
        return "opf"
    if mt == guess_type("a.ncx"):
        return "toc"
    return mt.partition("/")[0]
def check_ids(container):
    ''' Report duplicated id attributes in all HTML/OPF/NCX files. '''
    errors = []
    mts = set(OEB_DOCS) | {guess_type("a.opf"), guess_type("a.ncx")}
    for name, mt in container.mime_map.iteritems():
        if mt not in mts:
            continue
        root = container.parsed(name)
        seen_ids = {}  # id -> line of first occurrence
        dups = {}      # id -> lines of all occurrences
        for elem in root.xpath("//*[@id]"):
            eid = elem.get("id")
            if eid in seen_ids:
                dups.setdefault(eid, [seen_ids[eid]]).append(elem.sourceline)
            else:
                seen_ids[eid] = elem.sourceline
        errors.extend(DuplicateId(name, eid, locs) for eid, locs in dups.iteritems())
    return errors
def __init__(self, rootpath, opfpath, log, clone_data=None):
    # Container for a book unzipped at rootpath with its OPF at opfpath.
    # When clone_data is provided this is a cheap clone of an existing
    # container and the filesystem walk is skipped.
    self.root = clone_data['root'] if clone_data is not None else os.path.abspath(rootpath)
    self.log = log
    self.html_preprocessor = HTMLPreProcessor()
    self.css_preprocessor = CSSPreProcessor()
    self.tweak_mode = False

    self.parsed_cache = {}      # name -> parsed (lxml/css) representation
    self.mime_map = {}          # name -> media type
    self.name_path_map = {}     # name -> absolute filesystem path
    self.dirtied = set()        # names with unsaved parsed modifications
    self.encoding_map = {}      # name -> encoding used when parsing
    self.pretty_print = set()
    self.cloned = False
    self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

    if clone_data is not None:
        self.cloned = True
        for x in ('name_path_map', 'opf_name', 'mime_map', 'pretty_print', 'encoding_map', 'tweak_mode'):
            setattr(self, x, clone_data[x])
        self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
        return

    # Map of relative paths with '/' separators from root of unzipped ePub
    # to absolute paths on filesystem with os-specific separators
    opfpath = os.path.abspath(os.path.realpath(opfpath))
    for dirpath, _dirnames, filenames in os.walk(self.root):
        for f in filenames:
            path = join(dirpath, f)
            name = self.abspath_to_name(path)
            self.name_path_map[name] = path
            self.mime_map[name] = guess_type(path)
            # Special case if we have stumbled onto the opf
            if path == opfpath:
                self.opf_name = name
                self.opf_dir = os.path.dirname(path)
                self.mime_map[name] = guess_type('a.opf')

    if not hasattr(self, 'opf_name'):
        raise InvalidBook('Could not locate opf file: %r'%opfpath)

    # Update mime map with data from the OPF
    self.refresh_mime_map()
def find_existing_toc(container):
    """Return the name of the existing NCX ToC file, or None when the book
    has none. The spine's toc attribute takes precedence over a manifest
    lookup by media type."""
    spine_toc = container.opf_xpath("//opf:spine/@toc")
    toc = container.manifest_id_map.get(spine_toc[0], None) if spine_toc else None
    if not toc:
        toc = container.manifest_type_map.get(guess_type("a.ncx"), [None])[0]
    return toc if toc else None
def pretty_all(container):
    ''' Pretty print every HTML, CSS and XML file in the container,
    marking each prettified file as dirty. '''
    xml_mts = {guess_type('a.ncx'), guess_type('a.xml')}
    for name, mt in container.mime_map.iteritems():
        changed = True
        if mt in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
        elif mt in OEB_STYLES:
            # Parsing a stylesheet is enough: it is pretty printed on serialization
            container.parsed(name)
        elif name == container.opf_name:
            root = container.parsed(name)
            pretty_opf(root)
            pretty_xml_tree(root)
        elif mt in xml_mts:
            pretty_xml_tree(container.parsed(name))
        else:
            changed = False
        if changed:
            container.dirty(name)
def get_recommended_folders(container, names):
    ' Return the folders that are recommended for the given filenames '
    from calibre.ebooks.oeb.polish.utils import guess_type
    # Count, per file category, how many files live in each folder
    folder_counts = defaultdict(Counter)
    for name, mt in container.mime_map.iteritems():
        folder = name.rpartition('/')[0] if '/' in name else ''
        folder_counts[mt_to_category(container, mt)][folder] += 1
    # Recommend the most populous folder for each category
    recommendations = {}
    for category, counter in folder_counts.iteritems():
        recommendations[category] = counter.most_common(1)[0][0]
    ans = {}
    for n in names:
        category = mt_to_category(container, guess_type(os.path.basename(n)))
        ans[n] = recommendations.get(category, '')
    return ans
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
    # Prepare a book for rendering: extract it into tdir, build the render
    # manifest (including per-file metadata), virtualize resources and
    # write the manifest JSON to disk.
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # Files that must not be served to the client: the OPF, the NCX ToC,
    # everything under META-INF/ and the EPUB mimetype file
    excluded_names = {
        name for name, mt in self.mime_map.iteritems() if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype'
    }
    raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc': get_toc(self).as_dict,
        'spine': [name for name, is_linear in self.spine_names],
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.transform_css()
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Per-file metadata for the render manifest; also accumulates the
        # book-wide total/spine lengths and the has_maths flag
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
        return ans

    data['files'] = {name: manifest_data(name) for name in set(self.name_path_map) - excluded_names}
    self.commit()
    # The excluded files are not needed after rendering, remove them
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
def check_ids(container):
    ''' Report invalid and duplicated id attributes in HTML/OPF/NCX files. '''
    errors = []
    mts = set(OEB_DOCS) | {guess_type('a.opf'), guess_type('a.ncx')}
    for name, mt in iteritems(container.mime_map):
        if mt not in mts:
            continue
        root = container.parsed(name)
        seen_ids = {}  # id -> line of first occurrence
        dups = {}      # id -> lines of all occurrences
        for elem in root.xpath('//*[@id]'):
            eid = elem.get('id')
            if eid in seen_ids:
                dups.setdefault(eid, [seen_ids[eid]]).append(elem.sourceline)
            else:
                seen_ids[eid] = elem.sourceline
            if eid and valid_id.match(eid) is None:
                errors.append(InvalidId(name, elem.sourceline, eid))
        errors.extend(DuplicateId(name, eid, locs) for eid, locs in iteritems(dups))
    return errors
def get_filename(original_url_parsed, response):
    ''' Choose a filename for a downloaded resource, preferring the name in
    the response headers, then the URL path, then 'unknown'. When the
    response Content-Type disagrees with the type guessed from the name, an
    extension matching the Content-Type is appended. '''
    ans = (get_download_filename_from_response(response) or
           posixpath.basename(original_url_parsed.path) or 'unknown')
    ct = response.info().get('Content-Type', '')
    if ct:
        ct = cgi.parse_header(ct)[0].lower()
        if ct and guess_type(ans) != ct:
            exts = mimetypes.guess_all_extensions(ct)
            if exts:
                ans += exts[0]
    return ans
def pretty_all(container):
    ' Pretty print all HTML/CSS/XML files in the container '
    xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
        elif mt in OEB_STYLES:
            # Parsing a stylesheet is enough: it is pretty printed on serialization
            container.parsed(name)
        elif name == container.opf_name:
            opf_root = container.parsed(name)
            pretty_opf(opf_root)
            pretty_xml_tree(opf_root)
        elif mt in xml_types:
            pretty_xml_tree(container.parsed(name))
        else:
            continue
        container.dirty(name)
def replace_links(self, name, replace_func):
    '''
    Replace all links in name using replace_func, which must be a callable
    that accepts a URL and returns the replaced URL. It must also have a
    'replaced' attribute that is set to True if any actual replacement is
    done. Convenient ways of creating such callables are using the
    :class:`LinkReplacer` and :class:`LinkRebaser` classes.
    '''
    if name == self.opf_name:
        # All href attributes in the OPF are links
        for elem in self.opf_xpath('//*[@href]'):
            elem.set('href', replace_func(elem.get('href')))
    else:
        mt = self.mime_map.get(name, guess_type(name)).lower()
        if mt in OEB_DOCS:
            rewrite_links(self.parsed(name), replace_func)
        elif mt in OEB_STYLES:
            replaceUrls(self.parsed(name), replace_func)
        elif mt == guess_type('toc.ncx'):
            for elem in self.parsed(name).xpath('//*[@src]'):
                elem.set('src', replace_func(elem.get('src')))
    if replace_func.replaced:
        self.dirty(name)
    return replace_func.replaced
def requestStarted(self, rq):
    # Serve a request from the viewer's web engine over the custom
    # FAKE_PROTOCOL scheme: book content, the manifest, the reader
    # background image and mathjax files.
    if bytes(rq.requestMethod()) != b'GET':
        return self.fail_request(rq, rq.RequestDenied)
    url = rq.requestUrl()
    host = url.host()
    if host not in self.allowed_hosts or url.scheme() != FAKE_PROTOCOL:
        return self.fail_request(rq)
    name = url.path()[1:]
    # The sandboxed host may only access book content
    if host == SANDBOX_HOST and not name.startswith('book/'):
        return self.fail_request(rq)
    if name.startswith('book/'):
        name = name.partition('/')[2]
        if name == '__index__':
            send_reply(rq, 'text/html', b'<div>\xa0</div>')
            return
        elif name == '__popup__':
            send_reply(rq, 'text/html', b'<div id="calibre-viewer-footnote-iframe">\xa0</div>')
            return
        try:
            data, mime_type = get_data(name)
            if data is None:
                rq.fail(rq.UrlNotFound)
                return
            data = as_bytes(data)
            mime_type = {
                # Prevent warning in console about mimetype of fonts
                'application/vnd.ms-opentype': 'application/x-font-ttf',
                'application/x-font-truetype': 'application/x-font-ttf',
                'application/font-sfnt': 'application/x-font-ttf',
            }.get(mime_type, mime_type)
            send_reply(rq, mime_type, data)
        except Exception:
            import traceback
            traceback.print_exc()
            return self.fail_request(rq, rq.RequestFailed)
    elif name == 'manifest':
        # Manifest and metadata are served together as a two-element JSON array
        data = b'[' + set_book_path.manifest + b',' + set_book_path.metadata + b']'
        send_reply(rq, set_book_path.manifest_mime, data)
    elif name == 'reader-background':
        mt, data = background_image()
        if data:
            send_reply(rq, mt, data)
        else:
            rq.fail(rq.UrlNotFound)
    elif name.startswith('mathjax/'):
        from calibre.gui2.viewer.mathjax import monkeypatch_mathjax
        if name == 'mathjax/manifest.json':
            # Built lazily on first request and cached
            if self.mathjax_manifest is None:
                import json
                from calibre.srv.books import get_mathjax_manifest
                self.mathjax_manifest = as_bytes(json.dumps(get_mathjax_manifest()['files']))
            send_reply(rq, 'application/json', self.mathjax_manifest)
            return
        # Refuse paths that escape the mathjax directory
        path = os.path.abspath(os.path.join(self.mathjax_dir, '..', name))
        if path.startswith(self.mathjax_dir):
            mt = guess_type(name)
            try:
                with lopen(path, 'rb') as f:
                    raw = f.read()
            except EnvironmentError as err:
                prints("Failed to get mathjax file: {} with error: {}".format(name, err))
                return self.fail_request(rq, rq.RequestFailed)
            if 'MathJax.js' in name:
                # raw = open(os.path.expanduser('~/work/mathjax/unpacked/MathJax.js')).read()
                raw = monkeypatch_mathjax(raw.decode('utf-8')).encode('utf-8')
            send_reply(rq, mt, raw)
    elif not name:
        send_reply(rq, 'text/html', viewer_html())
    else:
        return self.fail_request(rq)
def raw_data(self, name, decode=True):
    ''' Return the raw data for the file name: bytes, or decoded text when
    decode is True and the media type is textual (styles, documents,
    text/plain or any XML type). '''
    data = self.open(name).read()
    mime = self.mime_map.get(name, guess_type(name))
    if decode:
        is_textual = (mime in OEB_STYLES or mime in OEB_DOCS or
                      mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'})
        if is_textual:
            data = self.decode(data)
    return data
def process_exploded_book(
    book_fmt, opfpath, input_fmt, tdir, render_manager, log=None,
    book_hash=None, save_bookmark_data=False, book_metadata=None,
    virtualize_resources=True
):
    '''
    Prepare an already-exploded (unzipped) book for the viewer: virtualize
    resources via worker processes, build the table of contents, cover page
    and landmarks, and write a calibre-book-manifest.json describing every
    file into the book root.

    :param book_fmt: The format of the book (e.g. EPUB/AZW3).
    :param opfpath: Path to the OPF file inside tdir.
    :param input_fmt: The original input format, used to pick the input plugin.
    :param tdir: Directory containing the exploded book.
    :param render_manager: Callable/worker pool that renders the named files.
    :param save_bookmark_data: If True, read META-INF/calibre_bookmarks.txt.
    :return: (container, bookmark_data) where bookmark_data is the raw
             bookmarks bytes or None.
    '''
    log = log or default_log
    container = SimpleContainer(tdir, opfpath, log)
    input_plugin = plugin_for_input_format(input_fmt)
    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))

    def needs_work(mt):
        # Only styles, HTML documents and SVG need per-file processing
        return mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'

    def work_priority(name):
        # Sort key: HTML documents first, then everything else; within each
        # group, by file size. NOTE(review): the trailing comma makes `size`
        # a one-element tuple — sort order is unaffected, but it looks like
        # an accidental comma; confirm before "fixing".
        size = os.path.getsize(container.name_path_map[name]),
        is_html = container.mime_map.get(name) in OEB_DOCS
        return (0 if is_html else 1), size

    if not is_comic:
        render_manager.launch_workers(
            tuple(n for n, mt in iteritems(container.mime_map)
                  if needs_work(mt)), container)
    bookmark_data = None
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        if container.exists(bm_file):
            with container.open(bm_file, 'rb') as f:
                bookmark_data = f.read()
    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(container.mime_map)
        if name == container.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not container.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = create_cover_page(
        container, input_fmt.lower(), is_comic, book_metadata)
    toc = get_toc(container, verify_destinations=False).to_dict(count())
    if not toc or not toc.get('children'):
        # No usable TOC in the book: synthesize one from headings
        toc = from_xpaths(container,
                          ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in container.spine_names]
    spineq = frozenset(spine)
    # Only keep landmarks that point into the spine
    landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]
    book_render_data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    names = sorted(
        (n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
        key=work_priority)
    results = render_manager(
        names, (tdir, opfpath, virtualize_resources,
                book_render_data['link_uid'],
                container.data_for_clone()), container)
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()

    def merge_ltm(dest, src):
        # Union per-key link sets from a worker into the accumulated map
        for k, v in iteritems(src):
            if k in dest:
                dest[k] |= v
            else:
                dest[k] = v

    # Merge the per-worker results into the aggregate structures
    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
                ltm[k] = v

    def manifest_data(name):
        # Build the per-file manifest entry; HTML files also carry length,
        # maths flag and anchor map computed by the workers
        mt = (container.mime_map.get(name) or
              'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(container.name_path_map[name]),
            'is_virtualized': name in virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            data = html_data[name]
            ans['length'] = l = data['length']
            book_render_data['total_length'] += l
            if name in book_render_data['spine']:
                book_render_data['spine_length'] += l
            ans['has_maths'] = hm = data['has_maths']
            if hm:
                book_render_data['has_maths'] = True
            ans['anchor_map'] = data['anchor_map']
        return ans

    book_render_data['files'] = {
        name: manifest_data(name)
        for name in set(container.name_path_map) - excluded_names}
    container.commit()
    # Physically delete the excluded files from disk after commit
    for name in excluded_names:
        os.remove(container.name_path_map[name])
    ltm = book_render_data['link_to_map']
    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization
    data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
    with lopen(os.path.join(container.root,
                            'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)
    return container, bookmark_data
def image_names(self): img_types = {guess_type('a.' + x) for x in ('png', 'jpeg', 'gif')} for name, mt in iteritems(self.container.mime_map): if mt.lower() in img_types: yield name
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: Metadata object supplying at least title/language.
    :param path: Output file path for the created book.
    :param fmt: Either 'epub' (default, written as a zip) or 'azw3'.
    :param opf_name: Name of the OPF file inside the book.
    :param html_name: Name of the single content document.
    :param toc_name: Name of the NCX table of contents.
    '''
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Pick up the first non-empty language from the generated metadata
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Build a minimal manifest (content document + NCX) and spine
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    HTML = '''\
<?xml version='1.0' encoding='utf-8'?>
<html lang="{1}" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>{0}</title>
</head>
<body>
<h1>{0}</h1>
</body>
</html>
'''.format(prepare_string_for_xml(mi.title), lang).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8',
                         xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True,
                         pretty_print=True)
    if fmt == 'azw3':
        # AZW3: write the parts to a temp dir and convert via opf_to_azw3
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name,
                          DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        # EPUB: the mimetype entry must be stored uncompressed
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype', b'application/epub+zip',
                        compression=ZIP_STORED)
            # Fixed: 0755 is a Python-2-only octal literal (SyntaxError on
            # Python 3); use 0o755 as the sibling implementation does
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def __init__(self, book_fmt, opfpath, input_fmt, tdir, log=None,
             book_hash=None, save_bookmark_data=False, book_metadata=None,
             allow_no_cover=True, virtualize_resources=True):
    '''
    Prepare the book rooted at tdir for rendering in the viewer: build the
    cover page, TOC and landmarks, transform/virtualize all resources, and
    write a calibre-book-manifest.json describing every file.

    :param book_fmt: The format of the book being rendered.
    :param opfpath: Path to the OPF inside tdir.
    :param input_fmt: Original input format, used to pick the input plugin.
    :param tdir: Directory containing the exploded book.
    :param save_bookmark_data: If True, stash META-INF/calibre_bookmarks.txt
                               on self.bookmark_data.
    '''
    log = log or default_log
    self.allow_no_cover = allow_no_cover
    ContainerBase.__init__(self, tdir, opfpath, log)
    self.book_metadata = book_metadata
    input_plugin = plugin_for_input_format(input_fmt)
    self.is_comic = bool(
        getattr(input_plugin, 'is_image_collection', False))
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        self.bookmark_data = None
        if self.exists(bm_file):
            with self.open(bm_file, 'rb') as f:
                self.bookmark_data = f.read()
    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(self.mime_map)
        if name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not self.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = self.create_cover_page(
        input_fmt.lower())
    toc = get_toc(self).to_dict(count())
    if not toc or not toc.get('children'):
        # No usable TOC: synthesize one from the book's headings
        toc = from_xpaths(self,
                          ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in self.spine_names]
    spineq = frozenset(spine)
    # Keep only landmarks whose destination is actually in the spine
    landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': self.is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.virtualized_names = set()
    self.transform_all(virtualize_resources)

    def manifest_data(name):
        # Build the per-file manifest entry; HTML files also carry their
        # rendered length, maths flag and anchor map
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
            ans['anchor_map'] = anchor_map(root)
        return ans

    data['files'] = {
        name: manifest_data(name)
        for name in set(self.name_path_map) - excluded_names
    }
    self.commit()
    # Physically delete the excluded files after commit
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    data = json.dumps(self.book_render_data, ensure_ascii=False)
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    with lopen(os.path.join(self.root,
                            'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)
def subset_all_fonts(container, font_stats, report): remove = set() total_old = total_new = 0 changed = False for name, mt in container.mime_map.iteritems(): if (mt in OEB_FONTS or name.rpartition('.')[-1].lower() in {'otf', 'ttf'}) and mt != guess_type('a.woff'): chars = font_stats.get(name, set()) with container.open(name, 'rb') as f: f.seek(0, os.SEEK_END) total_old += f.tell() if not chars: remove.add(name) report('Removed unused font: %s' % name) continue with container.open(name, 'r+b') as f: raw = f.read() font_name = get_font_names(raw)[-1] warnings = [] container.log('Subsetting font: %s' % (font_name or name)) try: nraw, old_sizes, new_sizes = subset(raw, chars, warnings=warnings) except UnsupportedFont as e: container.log.warning( 'Unsupported font: %s, ignoring. Error: %s' % (name, as_unicode(e))) continue for w in warnings: container.log.warn(w) olen = sum(old_sizes.itervalues()) nlen = sum(new_sizes.itervalues()) total_new += len(nraw) if nlen == olen: report('The font %s was already subset' % font_name) else: report( 'Decreased the font %s to %.1f%% of its original size' % (font_name, nlen / olen * 100)) changed = True f.seek(0), f.truncate(), f.write(nraw) for name in remove: container.remove_item(name) changed = True if remove: for name, mt in container.mime_map.iteritems(): if mt in OEB_STYLES: sheet = container.parsed(name) if remove_font_face_rules(container, sheet, remove, name): container.dirty(name) elif mt in OEB_DOCS: for style in XPath('//h:style')(container.parsed(name)): if style.get('type', 'text/css') == 'text/css' and style.text: sheet = container.parse_css(style.text, name) if remove_font_face_rules(container, sheet, remove, name): style.text = sheet.cssText container.dirty(name) if total_old > 0: report('Reduced total font size to %.1f%% of original' % (total_new / total_old * 100)) else: report('No embedded fonts found') return changed
def check_opf(container):
    '''
    Check the book's OPF for common structural problems (wrong root element,
    missing sections, bad idrefs, duplicate hrefs, broken TOC/cover entries,
    missing UID, non-XHTML spine items).

    :param container: The book container.
    :return: List of error objects describing the problems found.
    '''
    errors = []

    if container.opf.tag != OPF('package'):
        err = BaseError(_('The OPF does not have the correct root element'),
                        container.opf_name)
        err.HELP = xml(_(
            'The opf must have the root element <package> in namespace {0}, like this: <package xmlns="{0}">'
        )).format(OPF2_NS)
        errors.append(err)

    for tag in ('metadata', 'manifest', 'spine'):
        if not container.opf_xpath('/opf:package/opf:' + tag):
            errors.append(MissingSection(container.opf_name, tag))

    # Every idref must point at an existing id
    all_ids = set(container.opf_xpath('//*/@id'))
    for elem in container.opf_xpath('//*[@idref]'):
        if elem.get('idref') not in all_ids:
            errors.append(
                IncorrectIdref(container.opf_name, elem.get('idref'),
                               elem.sourceline))

    nl_items = [
        elem.sourceline for elem in container.opf_xpath(
            '//opf:spine/opf:itemref[@linear="no"]')
    ]
    if nl_items:
        errors.append(NonLinearItems(container.opf_name, nl_items))

    # Manifest items must point at existing files, with no duplicate hrefs
    seen, dups = {}, {}
    for item in container.opf_xpath(
            '/opf:package/opf:manifest/opf:item[@href]'):
        href = item.get('href')
        hname = container.href_to_name(href, container.opf_name)
        if not hname or not container.exists(hname):
            errors.append(
                MissingHref(container.opf_name, href, item.sourceline))
        if href in seen:
            if href not in dups:
                dups[href] = [seen[href]]
            dups[href].append(item.sourceline)
        else:
            seen[href] = item.sourceline
    # Fixed: iteritems(dups) instead of the Python-2-only dups.iteritems()
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs)
        for eid, locs in iteritems(dups))

    # Duplicate idrefs in the spine
    seen, dups = {}, {}
    for item in container.opf_xpath(
            '/opf:package/opf:spine/opf:itemref[@idref]'):
        ref = item.get('idref')
        if ref in seen:
            if ref not in dups:
                dups[ref] = [seen[ref]]
            dups[ref].append(item.sourceline)
        else:
            seen[ref] = item.sourceline
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs, for_spine=True)
        for eid, locs in iteritems(dups))

    # The spine's toc attribute must reference a manifest item with the
    # NCX media type
    spine = container.opf_xpath('/opf:package/opf:spine[@toc]')
    if spine:
        spine = spine[0]
        mitems = [
            x for x in container.opf_xpath(
                '/opf:package/opf:manifest/opf:item[@id]')
            if x.get('id') == spine.get('toc')
        ]
        if mitems:
            mitem = mitems[0]
            if mitem.get('media-type', '') != guess_type('a.ncx'):
                errors.append(
                    IncorrectToc(container.opf_name, mitem.sourceline,
                                 bad_mimetype=mitem.get('media-type')))
        else:
            errors.append(
                IncorrectToc(container.opf_name, spine.sourceline,
                             bad_idref=spine.get('toc')))

    covers = container.opf_xpath(
        '/opf:package/opf:metadata/opf:meta[@name="cover"]')
    if len(covers) > 0:
        if len(covers) > 1:
            errors.append(
                MultipleCovers(container.opf_name,
                               [c.sourceline for c in covers]))
        manifest_ids = set(
            container.opf_xpath('/opf:package/opf:manifest/opf:item/@id'))
        for cover in covers:
            if cover.get('content', None) not in manifest_ids:
                errors.append(
                    IncorrectCover(container.opf_name, cover.sourceline,
                                   cover.get('content', '')))
            raw = etree.tostring(cover)
            try:
                # Fixed: etree.tostring() returns bytes, so the needles must
                # be bytes too (str needles raise TypeError on Python 3)
                n, c = raw.index(b'name="'), raw.index(b'content="')
            except ValueError:
                n = c = -1
            # Nook requires name= to appear before content= in the meta tag
            if n > -1 and c > -1 and n > c:
                errors.append(NookCover(container.opf_name, cover.sourceline))

    uid = container.opf.get('unique-identifier', None)
    if uid is None or not container.opf_xpath(
            '/opf:package/opf:metadata/dc:identifier[@id=%r]' % uid):
        errors.append(NoUID(container.opf_name))

    # Every spine item must be XHTML
    for item, name, linear in container.spine_iter:
        mt = container.mime_map[name]
        if mt != XHTML_MIME:
            iid = item.get('idref', None)
            lnum = None
            if iid:
                mitem = container.opf_xpath(
                    '/opf:package/opf:manifest/opf:item[@id=%r]' % iid)
                if mitem:
                    lnum = mitem[0].sourceline
                else:
                    iid = None
            errors.append(BadSpineMime(name, iid, mt, lnum, container.opf_name))
    return errors
def create_book(mi, path, fmt='epub', opf_name='metadata.opf',
                html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: Metadata object supplying at least title/authors/language.
    :param path: Output file path for the created book.
    :param fmt: One of the valid_empty_formats ('epub', 'azw3', 'txt',
                'docx', ...); raises ValueError otherwise.
    :param opf_name: Name of the OPF file inside the book.
    :param html_name: Name of the single content document.
    :param toc_name: Name of the NCX table of contents.
    '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)
    if fmt == 'txt':
        # TXT: just the title (if any) as the whole file
        with open(path, 'wb') as f:
            if not mi.is_null('title'):
                f.write(mi.title)
        return
    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        p = Plumber('a.docx', 'b.docx', default_log)
        p.setup_options()
        # Use the word default of one inch page margins
        for x in 'left right top bottom'.split():
            setattr(p.opts, 'margin_' + x, 72)
        DOCX(p.opts, default_log).write(path, mi, create_empty_document=True)
        return
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Pick up the first non-empty language from the generated metadata
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Build a minimal manifest (content document + NCX) and spine
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    # Fill the HTML template with escaped title/authors/language
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(
            mi.authors))).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang),
                         encoding='utf-8', xml_declaration=True,
                         pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True,
                         pretty_print=True)
    if fmt == 'azw3':
        # AZW3: write the parts to a temp dir and convert via opf_to_azw3
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML),
                               (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)),
                          opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        # EPUB: the mimetype entry must be stored uncompressed
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype', b'application/epub+zip',
                        compression=ZIP_STORED)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def check_links(container):
    '''
    Check all links in the book: empty, dangling, case-mismatched, local
    file:// and Windows-unsafe links, plus unreferenced and unmanifested
    resources.

    :param container: The book container.
    :return: List of error objects describing the problems found.
    '''
    links_map = defaultdict(set)  # source name -> set of linked-to names
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append

    def fl(x):
        # repr() of the href, with the Python-2 u'' prefix stripped so the
        # message text is stable across interpreter versions
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types:
            for href, lnum, col in container.iterlinks(name):
                if not href:
                    a(EmptyLink(_('The link is empty'), name, lnum, col))
                try:
                    tname = container.href_to_name(href, name)
                except ValueError:
                    tname = None  # Absolute paths to files on another drive in windows cause this
                if tname is not None:
                    if container.exists(tname):
                        if tname in container.mime_map:
                            links_map[name].add(tname)
                        else:
                            # Filesystem says the file exists, but it is not in
                            # the mime_map, so either there is a case mismatch
                            # or the link is a directory
                            apath = container.name_to_abspath(tname)
                            if os.path.isdir(apath):
                                a(BadLink(
                                    _('The linked resource %s is a folder') %
                                    fl(href), name, lnum, col))
                            else:
                                a(CaseMismatch(
                                    href,
                                    actual_case_for_name(container, tname),
                                    name, lnum, col))
                    else:
                        # Target missing: distinguish a wrong-case name from
                        # a genuinely dangling link
                        cname = corrected_case_for_name(container, tname)
                        if cname is not None:
                            a(CaseMismatch(href, cname, name, lnum, col))
                        else:
                            a(DanglingLink(
                                _('The linked resource %s does not exist') %
                                fl(href), tname, name, lnum, col))
                else:
                    # Not resolvable to a book name: inspect the URL itself
                    purl = urlparse(href)
                    if purl.scheme == 'file':
                        a(FileLink(
                            _('The link %s is a file:// URL') % fl(href),
                            name, lnum, col))
                    elif purl.path and purl.path.startswith(
                            '/') and purl.scheme in {'', 'file'}:
                        a(LocalLink(
                            _('The link %s points to a file outside the book'
                              ) % fl(href), name, lnum, col))
                    elif purl.path and purl.scheme in {
                            '', 'file'} and ':' in urlunquote(purl.path):
                        a(InvalidCharInLink(
                            _('The link %s contains a : character, this will cause errors on Windows computers'
                              ) % fl(href), name, lnum, col))

    # Compute everything reachable from the spine: documents, stylesheets
    # (transitively, to follow @import chains), then other resources
    spine_docs = {name for name, linear in container.spine_names}
    spine_styles = {
        tname for name in spine_docs for tname in links_map[name]
        if container.mime_map.get(tname, None) in OEB_STYLES
    }
    num = -1
    while len(spine_styles) > num:  # Handle import rules in stylesheets
        num = len(spine_styles)
        spine_styles |= {
            tname for name in spine_styles for tname in links_map[name]
            if container.mime_map.get(tname, None) in OEB_STYLES
        }
    seen = set(OEB_DOCS) | set(OEB_STYLES)
    spine_resources = {
        tname for name in spine_docs | spine_styles
        for tname in links_map[name]
        if container.mime_map[tname] not in seen
    }
    unreferenced = set()

    # Report files that nothing in the spine (or nav/cover) references
    cover_name = container.guide_type_map.get('cover', None)
    nav_items = frozenset(container.manifest_items_with_property('nav'))
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES and name not in spine_styles:
            a(UnreferencedResource(name))
        elif mt in OEB_DOCS and name not in spine_docs and name not in nav_items:
            a(UnreferencedDoc(name))
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {
                'image', 'audio', 'video'
        }) and name not in spine_resources and name != cover_name:
            if mt.partition('/')[
                    0] == 'image' and name == get_raster_cover_name(container):
                continue
            a(UnreferencedResource(name))
        else:
            continue
        unreferenced.add(name)

    # Files present on disk but missing from the manifest
    manifest_names = set(itervalues(container.manifest_id_map))
    for name in container.mime_map:
        if name not in manifest_names and not container.ok_to_be_unmanifested(
                name):
            a(Unmanifested(name, unreferenced=name in unreferenced))
        if name == 'META-INF/calibre_bookmarks.txt':
            a(Bookmarks(name))
    return errors
def check_opf(container):
    '''
    Check the book's OPF for common structural problems (wrong/versionless
    root element, missing sections, empty ids, bad idrefs, duplicate hrefs,
    broken TOC/nav/cover entries, missing UID, empty identifiers, non-XHTML
    spine items).

    :param container: The book container.
    :return: List of error objects describing the problems found.
    '''
    errors = []
    opf_version = container.opf_version_parsed

    if container.opf.tag != OPF('package'):
        err = BaseError(_('The OPF does not have the correct root element'),
                        container.opf_name, container.opf.sourceline)
        err.HELP = xml(_(
            'The opf must have the root element <package> in namespace {0}, like this: <package xmlns="{0}">'
        )).format(OPF2_NS)
        errors.append(err)
    elif container.opf.get(
            'version') is None and container.book_type == 'epub':
        err = BaseError(_('The OPF does not have a version'),
                        container.opf_name, container.opf.sourceline)
        err.HELP = xml(_(
            'The <package> tag in the OPF must have a version attribute. This is usually version="2.0" for EPUB2 and AZW3 and version="3.0" for EPUB3'
        ))
        errors.append(err)

    for tag in ('metadata', 'manifest', 'spine'):
        if not container.opf_xpath('/opf:package/opf:' + tag):
            errors.append(MissingSection(container.opf_name, tag))

    # Every idref must point at an existing, non-empty id
    all_ids = set(container.opf_xpath('//*/@id'))
    if '' in all_ids:
        for empty_id_tag in container.opf_xpath('//*[@id=""]'):
            errors.append(EmptyID(container.opf_name, empty_id_tag.sourceline))
    all_ids.discard('')
    for elem in container.opf_xpath('//*[@idref]'):
        if elem.get('idref') not in all_ids:
            errors.append(
                IncorrectIdref(container.opf_name, elem.get('idref'),
                               elem.sourceline))

    nl_items = [
        elem.sourceline for elem in container.opf_xpath(
            '//opf:spine/opf:itemref[@linear="no"]')
    ]
    if nl_items:
        errors.append(NonLinearItems(container.opf_name, nl_items))

    # Manifest items must have an href pointing at an existing file, with
    # no duplicate hrefs
    seen, dups = {}, {}
    for item in container.opf_xpath('/opf:package/opf:manifest/opf:item'):
        href = item.get('href', None)
        if href is None:
            errors.append(
                NoHref(container.opf_name, item.get('id', None),
                       item.sourceline))
        else:
            hname = container.href_to_name(href, container.opf_name)
            if not hname or not container.exists(hname):
                errors.append(
                    MissingHref(container.opf_name, href, item.sourceline))
            if href in seen:
                if href not in dups:
                    dups[href] = [seen[href]]
                dups[href].append(item.sourceline)
            else:
                seen[href] = item.sourceline
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs)
        for eid, locs in iteritems(dups))

    # Duplicate idrefs in the spine
    seen, dups = {}, {}
    for item in container.opf_xpath(
            '/opf:package/opf:spine/opf:itemref[@idref]'):
        ref = item.get('idref')
        if ref in seen:
            if ref not in dups:
                dups[ref] = [seen[ref]]
            dups[ref].append(item.sourceline)
        else:
            seen[ref] = item.sourceline
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs, for_spine=True)
        for eid, locs in iteritems(dups))

    # The spine's toc attribute must reference a manifest item with the
    # NCX media type; if the attribute is absent but an NCX exists in the
    # manifest, report the missing reference
    spine = container.opf_xpath('/opf:package/opf:spine[@toc]')
    if spine:
        spine = spine[0]
        mitems = [
            x for x in container.opf_xpath(
                '/opf:package/opf:manifest/opf:item[@id]')
            if x.get('id') == spine.get('toc')
        ]
        if mitems:
            mitem = mitems[0]
            if mitem.get('media-type', '') != guess_type('a.ncx'):
                errors.append(
                    IncorrectToc(container.opf_name, mitem.sourceline,
                                 bad_mimetype=mitem.get('media-type')))
        else:
            errors.append(
                IncorrectToc(container.opf_name, spine.sourceline,
                             bad_idref=spine.get('toc')))
    else:
        spine = container.opf_xpath('/opf:package/opf:spine')
        if spine:
            spine = spine[0]
            ncx = container.manifest_type_map.get(guess_type('a.ncx'))
            if ncx:
                ncx_name = ncx[0]
                rmap = {v: k for k, v in iteritems(container.manifest_id_map)}
                ncx_id = rmap.get(ncx_name)
                if ncx_id:
                    errors.append(
                        MissingNCXRef(container.opf_name, spine.sourceline,
                                      ncx_id))

    # EPUB 3 books must have a (non-empty) nav document
    if opf_version.major > 2:
        existing_nav = find_existing_nav_toc(container)
        if existing_nav is None:
            errors.append(MissingNav(container.opf_name, 0))
        else:
            toc = parse_nav(container, existing_nav)
            if len(toc) == 0:
                errors.append(EmptyNav(existing_nav, 0))

    covers = container.opf_xpath(
        '/opf:package/opf:metadata/opf:meta[@name="cover"]')
    if len(covers) > 0:
        if len(covers) > 1:
            errors.append(
                MultipleCovers(container.opf_name,
                               [c.sourceline for c in covers]))
        manifest_ids = set(
            container.opf_xpath('/opf:package/opf:manifest/opf:item/@id'))
        for cover in covers:
            if cover.get('content', None) not in manifest_ids:
                errors.append(
                    IncorrectCover(container.opf_name, cover.sourceline,
                                   cover.get('content', '')))
            raw = etree.tostring(cover)
            try:
                n, c = raw.index(b'name="'), raw.index(b'content="')
            except ValueError:
                n = c = -1
            # Nook requires name= to appear before content= in the meta tag
            if n > -1 and c > -1 and n > c:
                errors.append(NookCover(container.opf_name, cover.sourceline))

    uid = container.opf.get('unique-identifier', None)
    if uid is None or not container.opf_xpath(
            '/opf:package/opf:metadata/dc:identifier[@id=%r]' % uid):
        errors.append(NoUID(container.opf_name))
    for elem in container.opf_xpath(
            '/opf:package/opf:metadata/dc:identifier'):
        if not elem.text or not elem.text.strip():
            errors.append(EmptyIdentifier(container.opf_name, elem.sourceline))

    # Every spine item must be XHTML
    for item, name, linear in container.spine_iter:
        mt = container.mime_map[name]
        if mt != XHTML_MIME:
            iid = item.get('idref', None)
            lnum = None
            if iid:
                mitem = container.opf_xpath(
                    '/opf:package/opf:manifest/opf:item[@id=%r]' % iid)
                if mitem:
                    lnum = mitem[0].sourceline
                else:
                    iid = None
            errors.append(BadSpineMime(name, iid, mt, lnum, container.opf_name))
    return errors
def iter_subsettable_fonts(container): woff_font_types = guess_type('a.woff'), guess_type('a.woff2') for name, mt in iteritems(container.mime_map): if (mt in OEB_FONTS or name.rpartition('.')[-1].lower() in {'otf', 'ttf'}) and mt not in woff_font_types: yield name, mt
rewrite_links, iterlinks, itercsslinks, urlquote, urlunquote) from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak from calibre.ebooks.oeb.polish.utils import PositionFinder, CommentFinder, guess_type from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile from calibre.utils.filenames import nlinks_file, hardlink_file from calibre.utils.ipc.simple_worker import fork_job, WorkerError from calibre.utils.logging import default_log from calibre.utils.zipfile import ZipFile exists, join, relpath = os.path.exists, os.path.join, os.path.relpath OEB_FONTS = { guess_type('a.ttf'), guess_type('b.otf'), guess_type('a.woff'), 'application/x-font-ttf', 'application/x-font-otf' } OPF_NAMESPACES = {'opf': OPF2_NS, 'dc': DC11_NS} class CSSPreProcessor(cssp): def __call__(self, data): return self.MS_PAT.sub(self.ms_sub, data) def clone_dir(src, dest): ' Clone a directory using hard links for the files, dest must already exist ' for x in os.listdir(src): dpath = os.path.join(dest, x)
def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
    '''
    Extract the given e-book into tdir and prepare it for rendering in the
    viewer: build the cover page and TOC, transform CSS, virtualize
    resources, and write a calibre-book-manifest.json describing every file.

    :param path_to_ebook: Path to the e-book file to extract.
    :param tdir: Directory to extract the book into.
    :param book_hash: Opaque hash stored in the render manifest.
    '''
    log = log or default_log
    book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
    ContainerBase.__init__(self, tdir, opfpath, log)
    # Exclude the OPF, NCX, META-INF entries and the mimetype file from the
    # rendered manifest.
    # NOTE(review): dict.iteritems() is Python-2-only; sibling code in this
    # file uses the iteritems() helper instead — confirm target interpreter
    excluded_names = {
        name for name, mt in self.mime_map.iteritems()
        if name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype'
    }
    raster_cover_name, titlepage_name = self.create_cover_page(
        input_fmt.lower())
    toc = get_toc(self).to_dict(count())
    spine = [name for name, is_linear in self.spine_names]
    spineq = frozenset(spine)
    # Keep only landmarks whose destination is actually in the spine
    landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.transform_css()
    self.virtualized_names = set()
    self.virtualize_resources()

    def manifest_data(name):
        # Build the per-file manifest entry; HTML files also carry their
        # rendered length, maths flag and anchor map
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
            ans['anchor_map'] = anchor_map(root)
        return ans

    data['files'] = {
        name: manifest_data(name)
        for name in set(self.name_path_map) - excluded_names
    }
    self.commit()
    # Physically delete the excluded files after commit
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    with lopen(os.path.join(self.root,
                            'calibre-book-manifest.json'), 'wb') as f:
        f.write(json.dumps(self.book_render_data,
                           ensure_ascii=False).encode('utf-8'))