Example #1
0
def check_link_destinations(container):
    """Check the destinations of links that point to HTML files.

    Every file in ``container.mime_map`` is visited. Links are collected from
    ``<a href>`` elements in HTML documents, ``<reference href>`` elements in
    the OPF and ``<content src>`` elements in the NCX, and each one is handed
    to ``check_link_destination``. Returns the list of errors that the
    per-link checks appended.
    """
    problems = []
    destinations = {}
    opf_mime, ncx_mime = guess_type("a.opf"), guess_type("a.ncx")
    # Guide reference types that are skipped when the book is an AZW3
    azw3_cover_types = {
        "cover",
        "other.ms-coverimage-standard",
        "other.ms-coverimage",
    }

    def verify(name, elem, attr):
        # Run the per-link check on the value of the given attribute
        check_link_destination(container, destinations, name, elem.get(attr), elem, problems)

    for name, mime in container.mime_map.iteritems():
        if mime in OEB_DOCS:
            for anchor in container.parsed(name).xpath('//*[local-name()="a" and @href]'):
                verify(name, anchor, "href")
        elif mime == opf_mime:
            for ref in container.opf_xpath("//opf:reference[@href]"):
                if container.book_type == "azw3" and ref.get("type") in azw3_cover_types:
                    continue
                verify(name, ref, "href")
        elif mime == ncx_mime:
            for content in container.parsed(name).xpath('//*[local-name() = "content" and @src]'):
                verify(name, content, "src")

    return problems
Example #2
0
def help_url(item, item_type, doc_name, extra_data=None):
    """Return the help URL for ``item``, or None if no URL is known.

    :param item: The name to look up (tag, attribute, CSS property...). It is
        lower-cased before the lookup.
    :param item_type: When equal to ``'css_property'`` only the CSS map is
        consulted; otherwise the maps are chosen from the type of ``doc_name``.
    :param doc_name: Name of the file being edited; its guessed mime type
        selects which URL maps are searched.
    :param extra_data: For OPF files, a string whose leading ``'3'`` selects
        the OPF 3 map; anything else (including None) selects OPF 2.

    Each selected map is searched first for the name as given and then for
    the name with any ``prefix:`` removed. The lookup name is never modified
    between maps, so an unprefixed name that is missing from the first map
    can still be found in a later one.
    """
    url_maps = ()
    item = item.lower()
    if item_type == 'css_property':
        url_maps = ('css',)
    else:
        mt = guess_type(doc_name)
        if mt in OEB_DOCS:
            url_maps = ('html', 'svg', 'mathml')
        elif mt == guess_type('a.svg'):
            url_maps = ('svg',)
        elif mt == guess_type('a.opf'):
            version = '3' if getattr(extra_data, 'startswith', lambda x: False)('3') else '2'
            url_maps = (('opf' + version),)
        elif mt == guess_type('a.ncx'):
            url_maps = ('opf2',)

    # The part after the first ':'; empty when there is no prefix, in which
    # case only the full name is tried.
    local_name = item.partition(':')[-1]
    candidates = (item, local_name) if local_name else (item,)

    for map_name in url_maps:
        umap = _url_map[map_name]
        for candidate in candidates:
            if candidate in umap:
                return umap[candidate]

    return None
Example #3
0
 def iterlinks(self, name, get_line_numbers=True):
     ''' Iterate over all links in name. If get_line_numbers is True then
     yields results of the form (link, line_number, offset), where
     line_number is the line at which the link occurs and offset is the
     number of characters from the start of the line; otherwise only the
     link itself is yielded. Note that offset could actually encompass
     several lines if not zero. '''
     mime = self.mime_map.get(name, guess_type(name))

     if name == self.opf_name:
         # The OPF: every element that carries an href attribute
         for elem in self.opf_xpath('//*[@href]'):
             href = elem.get('href')
             if get_line_numbers:
                 yield href, elem.sourceline, 0
             else:
                 yield href
         return

     mime = mime.lower()
     if mime in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             if get_line_numbers:
                 yield link, el.sourceline, pos
             else:
                 yield link
     elif mime in OEB_STYLES:
         if not get_line_numbers:
             for link in getUrls(self.parsed(name)):
                 yield link
         else:
             with self.open(name, 'rb') as f:
                 # Normalize line endings before computing positions
                 text = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
                 locate = PositionFinder(text)
                 in_comment = CommentFinder(text)
                 for link, offset in itercsslinks(text):
                     if in_comment(offset):
                         # Links inside CSS comments are not reported
                         continue
                     lnum, col = locate(offset)
                     yield link, lnum, col
     elif mime == guess_type('toc.ncx'):
         for elem in self.parsed(name).xpath('//*[@src]'):
             src = elem.get('src')
             if get_line_numbers:
                 yield src, elem.sourceline, 0
             else:
                 yield src
Example #4
0
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    ''' Create an empty book in the specified format at the specified location.

    :param mi: Metadata object; it is converted to an OPF tree and its title
        and authors are substituted into the HTML template.
    :param path: Destination path of the book file (made absolute).
    :param fmt: ``'azw3'`` builds the files in a temporary directory and
        converts them with ``opf_to_azw3``; any other value writes an EPUB zip.
    :param opf_name: Name of the OPF file inside the book.
    :param html_name: Name of the single HTML file inside the book.
    :param toc_name: Name of the NCX table of contents inside the book.
    '''
    path = os.path.abspath(path)
    # Use the first non-empty <language> from the metadata, else 'und'
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Add a manifest (HTML file + NCX) and a spine referencing the HTML file
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    # Fill in the HTML template, then re-parse and pretty print it
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)
    if fmt == 'azw3':
        # Write the three files into a scratch directory and convert from there
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # The mimetype entry is written first and stored uncompressed
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # 0o755 is the octal spelling accepted by both Python 2.6+ and 3
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
Example #5
0
def create_epub(manifest, spine=(), guide=(), meta_cover=None, ver=3):
    """Build an EPUB in memory and return it as a BytesIO positioned at 0.

    ``manifest`` is a sequence of ``(name, data, properties)`` tuples; each
    name is used as both the item id and the href. When ``spine`` is empty it
    defaults to every manifest entry whose guessed type is in OEB_DOCS.
    ``guide`` is a sequence of ``(name, type)`` pairs and ``meta_cover``, if
    set, becomes the content of a ``<meta name="cover">`` element.
    """
    item_entries = []
    for name, data, properties in manifest:
        props_attr = ('properties="%s"' % properties) if properties else ''
        item_entries.append(
            '<item id="%s" href="%s" media-type="%s" %s/>' % (name, name, guess_type(name), props_attr))
    manifest_xml = ''.join(item_entries)

    metadata_xml = ('<meta name="cover" content="%s"/>' % meta_cover) if meta_cover else ''

    if not spine:
        spine = [entry[0] for entry in manifest if guess_type(entry[0]) in OEB_DOCS]
    spine_xml = ''.join('<itemref idref="%s"/>' % idref for idref in spine)
    guide_xml = ''.join('<reference href="%s" type="%s"/>' % (href, typ) for href, typ in guide)

    opf = OPF_TEMPLATE.format(
        manifest=manifest_xml, ver='%d.0'%ver, metadata=metadata_xml, spine=spine_xml, guide=guide_xml)

    out = BytesIO()
    with ZipFile(out, 'w', ZIP_STORED) as zf:
        zf.writestr('META-INF/container.xml', b'''
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="content.opf" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>''')
        zf.writestr('content.opf', opf.encode('utf-8'))
        for name, data, properties in manifest:
            # Text payloads are stored as UTF-8
            payload = data.encode('utf-8') if isinstance(data, type('')) else data
            zf.writestr(name, payload)
    out.seek(0)
    return out
Example #6
0
def check_links(container):
    ''' Check the links in all HTML, CSS, OPF and NCX files of the container.

    Returns a list of error objects:

    * BadLink for a link that resolves to a name that does not exist
    * FileLink for a file:// URL
    * LocalLink for a scheme-less link with an absolute path
    * UnreferencedResource / UnreferencedDoc for files that are not reachable
      from the spine
    * Unmanifested for files that are missing from the manifest
    '''
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    errors = []
    a = errors.append

    def fl(x):
        # repr() of the link without a leading u (unicode literal) prefix
        x = repr(x)
        if x.startswith('u'):
            x = x[1:]
        return x

    for name, mt in container.mime_map.iteritems():
        if mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types:
            for href, lnum, col in container.iterlinks(name):
                tname = container.href_to_name(href, name)
                if tname is not None:
                    if container.exists(tname):
                        links_map[name].add(tname)
                    else:
                        a(BadLink(_('The linked resource %s does not exist') % fl(href), name, lnum, col))
                else:
                    purl = urlparse(href)
                    if purl.scheme == 'file':
                        a(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
                    elif not purl.scheme and purl.path.startswith('/'):
                        # file:// was handled above, so only scheme-less
                        # absolute paths can reach this branch
                        a(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))

    spine_docs = {name for name, linear in container.spine_names}
    spine_styles = {tname for name in spine_docs for tname in links_map[name] if container.mime_map[tname] in OEB_STYLES}
    num = -1
    while len(spine_styles) > num:
        # Handle import rules in stylesheets: keep following stylesheet ->
        # stylesheet links until no new stylesheets are discovered
        num = len(spine_styles)
        spine_styles |= {tname for name in spine_styles for tname in links_map[name] if container.mime_map[tname] in OEB_STYLES}
    seen = set(OEB_DOCS) | set(OEB_STYLES)
    spine_resources = {tname for name in spine_docs | spine_styles for tname in links_map[name] if container.mime_map[tname] not in seen}

    # The guide cover is never reported as unreferenced
    cover_name = container.guide_type_map.get('cover', None)

    for name, mt in container.mime_map.iteritems():
        if mt in OEB_STYLES and name not in spine_styles:
            a(UnreferencedResource(name))
        elif mt in OEB_DOCS and name not in spine_docs:
            a(UnreferencedDoc(name))
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {'image', 'audio', 'video'}) and name not in spine_resources and name != cover_name:
            a(UnreferencedResource(name))

    manifest_names = set(container.manifest_id_map.itervalues())
    for name in container.mime_map:
        if name not in container.names_that_need_not_be_manifested and name not in manifest_names:
            a(Unmanifested(name))

    return errors
Example #7
0
    def __init__(self, rootpath, opfpath, log, clone_data=None):
        ''' Set up the container, either by scanning the files under
        rootpath or, when clone_data is given, by copying the maps it
        holds. Raises InvalidBook when opfpath is not found under the root. '''
        is_clone = clone_data is not None
        self.root = clone_data['root'] if is_clone else os.path.abspath(rootpath)
        self.log = log
        self.html_preprocessor = HTMLPreProcessor()
        self.css_preprocessor = CSSPreProcessor()
        self.tweak_mode = False

        self.parsed_cache, self.mime_map, self.name_path_map = {}, {}, {}
        self.dirtied = set()
        self.encoding_map = {}
        self.pretty_print = set()
        self.cloned = is_clone
        self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

        if is_clone:
            # A clone takes its state from clone_data and skips the scan
            for attr in ('name_path_map', 'opf_name', 'mime_map', 'pretty_print', 'encoding_map', 'tweak_mode'):
                setattr(self, attr, clone_data[attr])
            self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
            return

        # Map of relative paths with '/' separators from root of unzipped ePub
        # to absolute paths on filesystem with os-specific separators
        opfpath = os.path.abspath(os.path.realpath(opfpath))
        for dirpath, _subdirs, filenames in os.walk(self.root):
            for fname in filenames:
                path = join(dirpath, fname)
                name = self.abspath_to_name(path)
                if isosx:
                    # OS X silently converts all file names to NFD form,
                    # while the EPUB spec requires all text, including file
                    # names, to be in NFC form. The proper fix would be a VFS
                    # mapping canonical names to their filesystem
                    # representation; until that exists, normalizing here at
                    # least fixes books that properly use NFC. Books that use
                    # NFD names will be broken by this.
                    name = unicodedata.normalize('NFC', name)
                self.name_path_map[name] = path
                self.mime_map[name] = guess_type(path)
                if path != opfpath:
                    continue
                # Special case: this file is the OPF
                self.opf_name, self.opf_dir = name, os.path.dirname(path)
                self.mime_map[name] = guess_type('a.opf')

        if not hasattr(self, 'opf_name'):
            raise InvalidBook('Could not locate opf file: %r'%opfpath)

        # Update mime map with data from the OPF
        self.refresh_mime_map()
Example #8
0
    def __init__(self, rootpath, opfpath, log, clone_data=None):
        ''' Set up the container, either by scanning the files under
        rootpath or, when clone_data is given, by copying the maps it
        holds. Raises InvalidBook when opfpath is not found under the root. '''
        is_clone = clone_data is not None
        self.root = clone_data['root'] if is_clone else os.path.abspath(rootpath)
        self.log = log
        self.html_preprocessor = HTMLPreProcessor()
        self.css_preprocessor = CSSPreProcessor()
        self.tweak_mode = False

        self.parsed_cache, self.mime_map, self.name_path_map = {}, {}, {}
        self.dirtied = set()
        self.encoding_map = {}
        self.pretty_print = set()
        self.cloned = is_clone
        self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

        if is_clone:
            # A clone takes its state from clone_data and skips the scan
            for attr in ('name_path_map', 'opf_name', 'mime_map', 'pretty_print', 'encoding_map', 'tweak_mode'):
                setattr(self, attr, clone_data[attr])
            self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
            return

        # Map of relative paths with '/' separators from root of unzipped ePub
        # to absolute paths on filesystem with os-specific separators
        opfpath = os.path.abspath(os.path.realpath(opfpath))
        for dirpath, _subdirs, filenames in os.walk(self.root):
            for fname in filenames:
                path = join(dirpath, fname)
                # OS X silently converts all file names to NFD form, while
                # the EPUB spec requires all text, including file names, to
                # be in NFC form. The proper fix would be a VFS mapping
                # canonical names to their file system representation. Since
                # the container normalizes all text files to NFC when
                # decoding them anyway, there should be no mismatch between
                # names in the text and NFC canonical file names.
                name = unicodedata.normalize('NFC', self.abspath_to_name(path))
                self.name_path_map[name] = path
                self.mime_map[name] = guess_type(path)
                if path != opfpath:
                    continue
                # Special case: this file is the OPF
                self.opf_name, self.opf_dir = name, os.path.dirname(path)
                self.mime_map[name] = guess_type('a.opf')

        if not hasattr(self, 'opf_name'):
            raise InvalidBook('Could not locate opf file: %r'%opfpath)

        # Update mime map with data from the OPF
        self.refresh_mime_map()
Example #9
0
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    """Fetch one external resource into a temporary file inside ``tdir``.

    Handles ``file:`` URLs (read from ``purl.path``), ``data:`` URLs (decoded
    in memory) and everything else via ``urlopen``.

    :param tdir: Directory in which the temporary file is created.
    :param timeout: Timeout passed to ``urlopen``.
    :param progress_report: Callable invoked as ``(url, 0, size)`` before the
        copy starts and passed on to ``ProgressTracker``.
    :param data_uri_map: Dict mapping an already seen data: payload to the
        file it was saved in; updated for newly saved payloads.
    :param url: The URL to fetch.
    :return: ``(True, (url, filename, path, mimetype))`` on success, or
        ``(False, (url, error_message))`` if anything raised.
    """
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # basename() needs the path string, not the open file object
                filename = os.path.basename(purl.path)
                # Seek to the end to learn the size, then rewind
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = standard_b64decode(payload)
                else:
                    payload = payload.encode('utf-8')
                # Derive the file name from the declared mime type before the
                # duplicate check, so that the early return below has one.
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    # NOTE(review): the type is guessed from the stored path,
                    # as originally written; confirm whether it should be
                    # guessed from filename instead.
                    return True, (url, filename, seen_before, guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        # Failures are reported to the caller instead of being raised
        return False, (url, as_unicode(err))
Example #10
0
def download_one(tdir, timeout, progress_report, url):
    """Fetch one external resource into a temporary file inside ``tdir``.

    ``file:`` URLs are read from ``purl.path``; everything else is fetched
    with ``urlopen``.

    :param tdir: Directory in which the temporary file is created.
    :param timeout: Timeout passed to ``urlopen``.
    :param progress_report: Callable invoked as ``(url, 0, size)`` before the
        copy starts and passed on to ``ProgressTracker``.
    :param url: The URL to fetch.
    :return: ``(True, (url, filename, path, mimetype))`` on success, or
        ``(False, (url, error_message))`` if anything raised.
    """
    try:
        purl = urlparse(url)
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                src = lopen(purl.path, 'rb')
                # basename() needs the path string, not the open file object
                filename = os.path.basename(purl.path)
                # Seek to the end to learn the size, then rewind
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            filename = sanitize_file_name2(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError('The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError('The external resource {} is not of a known type'.format(url))
            return True, (url, sanitize_file_name2(filename), dest.name, mt)
    except Exception as err:
        # Failures are reported to the caller instead of being raised
        return False, (url, as_unicode(err))
Example #11
0
def get_recommended_folders(container, names):
    ''' Return the folders that are recommended for the given filenames. The
    recommendation is based on where the majority of files of the same type are
    located in the container. If no files of a particular type are present, the
    recommended folder is assumed to be the folder containing the OPF file.

    Returns a dict mapping each entry of names to a folder name ('' is the
    root of the container). '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    counts = defaultdict(Counter)
    for name, mt in container.mime_map.iteritems():
        folder = name.rpartition('/')[0] if '/' in name else ''
        counts[mt_to_category(container, mt)][folder] += 1

    # Use .get() rather than indexing: indexing a defaultdict never raises
    # KeyError and would insert an empty Counter, on which most_common(1)[0]
    # fails with IndexError both here and in the comprehension below.
    opf_counter = counts.get('opf')
    opf_folder = opf_counter.most_common(1)[0][0] if opf_counter else ''

    recommendations = {
        category: counter.most_common(1)[0][0]
        for category, counter in counts.iteritems()
    }
    return {
        n: recommendations.get(
            mt_to_category(container, guess_type(os.path.basename(n))),
            opf_folder)
        for n in names
    }
Example #12
0
def get_decoded_raw(name):
    """Read the file ``name`` and return ``(raw, syntax)``.

    ``syntax`` is whatever ``syntax_from_mime`` reports for the file. The
    data is decoded to text where possible:

    * unknown syntax: decoded as UTF-8, left as bytes if that fails
    * ``raster_image``: left untouched
    * ``html`` / ``xml``: decoded with ``xml_to_unicode``
    * anything else: decoded with the encoding named by a ``coding:`` /
      ``coding=`` declaration in the first 1024 bytes, or else the encoding
      from ``force_encoding``; UTF-8 is tried as a fallback and the data is
      left as bytes if that fails too
    """
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding
    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))
    if syntax is None:
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass
    elif syntax != 'raster_image':
        if syntax in {'html', 'xml'}:
            raw = xml_to_unicode(raw, verbose=True)[0]
        else:
            m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
            # The match group is bytes, so compare it against bytes literals
            if m is not None and m.group(1) != b'8bit':
                enc = m.group(1)
                if enc == b'unicode':
                    enc = 'utf-8'
            else:
                enc = force_encoding(raw, verbose=True)
            if isinstance(enc, bytes):
                # decode() wants a text encoding name on Python 3
                enc = enc.decode('ascii', 'replace')
            try:
                raw = raw.decode(enc)
            except (LookupError, ValueError):
                try:
                    raw = raw.decode('utf-8')
                except ValueError:
                    pass
    return raw, syntax
Example #13
0
 def guess_type(self, name):
     ''' Guess the mime type for name, reporting text/html as
     application/xhtml+xml. '''
     # epubcheck complains if the mimetype for text documents is set to
     # text/html in EPUB 2 books. Sigh.
     mimetype = guess_type(name)
     return 'application/xhtml+xml' if mimetype == 'text/html' else mimetype
Example #14
0
 def __init__(self, path_to_ebook, tdir, log=None):
     ''' Extract the book at path_to_ebook into tdir, normalize the spine
     documents, virtualize resources and write calibre-book-manifest.json
     into the container root. '''
     if not log:
         log = default_log
     book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
     ContainerBase.__init__(self, tdir, opfpath, log)

     # The OPF, any NCX and everything under META-INF/ are left out of the
     # manifest and deleted once the container has been committed
     excluded_names = set()
     for name, mt in self.mime_map.iteritems():
         if name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/'):
             excluded_names.add(name)

     self.book_render_data = data = {
         'version': self.RENDER_VERSION,
         'toc':get_toc(self).as_dict,
         'spine':[name for name, is_linear in self.spine_names],
         'link_uid': uuid4(),
         'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
         'manifest': list(set(self.name_path_map) - excluded_names),
     }
     # Mark the spine as dirty since we have to ensure it is normalized
     for spine_name in data['spine']:
         self.parsed(spine_name)
         self.dirty(spine_name)
     self.virtualize_resources()
     self.commit()
     for excluded in excluded_names:
         os.remove(self.name_path_map[excluded])
     manifest_path = os.path.join(self.root, 'calibre-book-manifest.json')
     with lopen(manifest_path, 'wb') as f:
         f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
Example #15
0
    def manifest_key(x):
        ''' Sort key for a manifest item: a (category, position) tuple.
        Documents come first, ordered by their place in spine_ids; all other
        items are ordered by sort_key(href) within their category. '''
        mt = x.get('media-type', '')
        href = x.get('href', '')
        if mt in OEB_DOCS:
            # Documents that are not in the spine sort after those that are
            return (0, spine_ids.get(x.get('id', None), 1000000000))

        ext = href.rpartition('.')[-1].lower()
        if mt == guess_type('a.ncx'):
            category = 1
        elif mt in OEB_STYLES:
            category = 2
        elif mt.startswith('image/'):
            category = 3
        elif ext in {'otf', 'ttf', 'woff'}:
            category = 4
        elif mt.startswith('audio/'):
            category = 5
        elif mt.startswith('video/'):
            category = 6
        else:
            category = 1000
        return (category, sort_key(href))
Example #16
0
 def __init__(self, name, lnum, bad_idref=None, bad_mimetype=None):
     ''' Error for a Table of Contents reference that is broken, either
     because the referenced id does not exist (bad_idref) or because the
     item has the wrong media type (bad_mimetype). '''
     if bad_idref is None:
         msg = _('The item identified as the Table of Contents has an incorrect media-type (%s)') % bad_mimetype
         self.HELP = _('The media type for the table of contents must be %s') % guess_type('a.ncx')
     else:
         msg = _('The item identified as the Table of Contents (%s) does not exist') % bad_idref
         self.HELP = _('There is no item with id="%s" in the manifest.') % bad_idref
     BaseError.__init__(self, msg, name, lnum)
Example #17
0
def mt_to_category(container, mt):
    ''' Map a mime type to a category name: text, style, font, opf, toc or,
    for anything else, the part of the mime type before the slash. '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    from calibre.ebooks.oeb.polish.container import OEB_FONTS
    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
    if mt in OEB_DOCS:
        return 'text'
    if mt in OEB_STYLES:
        return 'style'
    if mt in OEB_FONTS:
        return 'font'
    if mt == guess_type('a.opf'):
        return 'opf'
    if mt == guess_type('a.ncx'):
        return 'toc'
    # Fall back to the major type, e.g. image/png -> image
    return mt.partition('/')[0]
Example #18
0
def find_existing_ncx_toc(container):
    ''' Return the name of the NCX table of contents, or None. The item
    named by the toc attribute of the spine is preferred; otherwise the
    first manifest item with the NCX media type is used. '''
    spine_toc_ids = container.opf_xpath('//opf:spine/@toc')
    name = None
    if spine_toc_ids:
        name = container.manifest_id_map.get(spine_toc_ids[0], None)
    if name:
        return name
    # Fall back to the first manifest item of NCX type, if there is one
    candidates = container.manifest_type_map.get(guess_type('a.ncx'), [None])
    return candidates[0] or None
Example #19
0
 def parsed(self, name):
     ''' Return the parsed form of name, parsing the file and caching the
     result (and the encoding that was used) on first access. '''
     cached = self.parsed_cache.get(name, None)
     if cached is not None:
         return cached
     # self.parse() is expected to set used_encoding as a side effect
     self.used_encoding = None
     mime = self.mime_map.get(name, guess_type(name))
     result = self.parse(self.name_path_map[name], mime)
     self.parsed_cache[name] = result
     self.encoding_map[name] = self.used_encoding
     return result
Example #20
0
def mt_to_category(container, mt):
    """Map a mime type to a category name: text, style, font, opf, toc or,
    for anything else, the part of the mime type before the slash."""
    from calibre.ebooks.oeb.polish.utils import guess_type
    from calibre.ebooks.oeb.polish.container import OEB_FONTS
    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES

    if mt in OEB_DOCS:
        return "text"
    if mt in OEB_STYLES:
        return "style"
    if mt in OEB_FONTS:
        return "font"
    if mt == guess_type("a.opf"):
        return "opf"
    if mt == guess_type("a.ncx"):
        return "toc"
    # Fall back to the major type, e.g. image/png -> image
    return mt.partition("/")[0]
Example #21
0
def check_ids(container):
    """Report ids that occur more than once within a single HTML, OPF or NCX
    file. Returns a list of DuplicateId errors, each carrying the source
    lines of every occurrence of the id."""
    errors = []
    checked_types = set(OEB_DOCS) | {guess_type("a.opf"), guess_type("a.ncx")}
    for name, mt in container.mime_map.iteritems():
        if mt not in checked_types:
            continue
        first_seen = {}   # id -> line of its first occurrence
        duplicates = {}   # id -> lines of all occurrences, first one included
        for elem in container.parsed(name).xpath("//*[@id]"):
            eid = elem.get("id")
            if eid not in first_seen:
                first_seen[eid] = elem.sourceline
                continue
            duplicates.setdefault(eid, [first_seen[eid]]).append(elem.sourceline)
        for eid, locs in duplicates.iteritems():
            errors.append(DuplicateId(name, eid, locs))
    return errors
Example #22
0
    def __init__(self, rootpath, opfpath, log, clone_data=None):
        ''' Set up the container, either by scanning the files under
        rootpath or, when clone_data is given, by copying the maps it
        holds. Raises InvalidBook when opfpath is not found under the root. '''
        is_clone = clone_data is not None
        self.root = clone_data['root'] if is_clone else os.path.abspath(rootpath)
        self.log = log
        self.html_preprocessor = HTMLPreProcessor()
        self.css_preprocessor = CSSPreProcessor()
        self.tweak_mode = False

        self.parsed_cache, self.mime_map, self.name_path_map = {}, {}, {}
        self.dirtied = set()
        self.encoding_map = {}
        self.pretty_print = set()
        self.cloned = is_clone
        self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map', 'encoding_map', 'dirtied', 'pretty_print')

        if is_clone:
            # A clone takes its state from clone_data and skips the scan
            for attr in ('name_path_map', 'opf_name', 'mime_map', 'pretty_print', 'encoding_map', 'tweak_mode'):
                setattr(self, attr, clone_data[attr])
            self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
            return

        # Map of relative paths with '/' separators from root of unzipped ePub
        # to absolute paths on filesystem with os-specific separators
        opfpath = os.path.abspath(os.path.realpath(opfpath))
        for dirpath, _subdirs, filenames in os.walk(self.root):
            for fname in filenames:
                path = join(dirpath, fname)
                name = self.abspath_to_name(path)
                self.name_path_map[name] = path
                self.mime_map[name] = guess_type(path)
                if path != opfpath:
                    continue
                # Special case: this file is the OPF
                self.opf_name, self.opf_dir = name, os.path.dirname(path)
                self.mime_map[name] = guess_type('a.opf')

        if not hasattr(self, 'opf_name'):
            raise InvalidBook('Could not locate opf file: %r'%opfpath)

        # Update mime map with data from the OPF
        self.refresh_mime_map()
Example #23
0
def find_existing_toc(container):
    """Return the name of the NCX table of contents, or None. The item
    named by the toc attribute of the spine is preferred; otherwise the
    first manifest item with the NCX media type is used."""
    spine_toc_ids = container.opf_xpath("//opf:spine/@toc")
    found = None
    if spine_toc_ids:
        found = container.manifest_id_map.get(spine_toc_ids[0], None)
    if not found:
        # Fall back to the first manifest item of NCX type, if there is one
        found = container.manifest_type_map.get(guess_type("a.ncx"), [None])[0]
    return found if found else None
Example #24
0
def pretty_all(container):
    ' Pretty print the HTML, CSS, OPF, NCX and XML files in the container '
    for name, mt in container.mime_map.iteritems():
        if mt in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
        elif mt in OEB_STYLES:
            # Stylesheets only need to be parsed and marked dirty
            container.parsed(name)
        elif name == container.opf_name:
            opf_root = container.parsed(name)
            pretty_opf(opf_root)
            pretty_xml_tree(opf_root)
        elif mt in {guess_type('a.ncx'), guess_type('a.xml')}:
            pretty_xml_tree(container.parsed(name))
        else:
            # Nothing was changed, so there is nothing to mark dirty
            continue
        container.dirty(name)
Example #25
0
def get_recommended_folders(container, names):
    ''' Return a dict mapping each of the given filenames to its recommended
    folder: the folder that holds the most files of the same category in the
    container, or '' when the container has no file of that category. '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    counts = defaultdict(Counter)
    for name, mt in container.mime_map.iteritems():
        # rpartition() yields an empty folder for names without a slash
        folder = name.rpartition('/')[0]
        counts[mt_to_category(container, mt)][folder] += 1

    recommendations = {}
    for category, counter in counts.iteritems():
        recommendations[category] = counter.most_common(1)[0][0]

    result = {}
    for n in names:
        category = mt_to_category(container, guess_type(os.path.basename(n)))
        result[n] = recommendations.get(category, '')
    return result
Example #26
0
    def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
        ''' Extract the book at path_to_ebook into tdir and prepare it for
        rendering: create the cover page, normalize the spine documents,
        transform the CSS, virtualize resources, and finally write the
        collected render data to calibre-book-manifest.json in the container
        root. book_hash is stored as-is in the render data. '''
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        # The OPF, any NCX, everything under META-INF/ and the mimetype file
        # are left out of the files map and deleted after the commit below
        excluded_names = {
            name for name, mt in self.mime_map.iteritems() if
            name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
            name == 'mimetype'
        }
        raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc':get_toc(self).as_dict,
            'spine':[name for name, is_linear in self.spine_names],
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            # The next three entries are filled in by manifest_data() below
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name), self.dirty(name)
        self.transform_css()
        self.virtualized_names = set()
        self.virtualize_resources()

        def manifest_data(name):
            ''' Build the files-map entry for name. For HTML files this also
            adds the document length to the book totals and records whether
            maths was detected. '''
            mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
            ans = {
                'size':os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype':mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = l = get_length(root)
                self.book_render_data['total_length'] += l
                if name in data['spine']:
                    self.book_render_data['spine_length'] += l
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
            return ans
        data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
        self.commit()
        # Remove the excluded files from disk now that everything is committed
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
            f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))
Example #27
0
def check_ids(container):
    ''' Report invalid and duplicated id attributes in the HTML, OPF and NCX
    files of the container. Returns a list of error objects. '''
    problems = []
    checked_types = set(OEB_DOCS)
    checked_types.add(guess_type('a.opf'))
    checked_types.add(guess_type('a.ncx'))
    for name, mt in iteritems(container.mime_map):
        if mt not in checked_types:
            continue
        # id -> line of its first occurrence
        first_line = {}
        # id -> lines of every occurrence, only for ids seen more than once
        repeated = {}
        for node in container.parsed(name).xpath('//*[@id]'):
            eid = node.get('id')
            if eid not in first_line:
                first_line[eid] = node.sourceline
            else:
                repeated.setdefault(eid, [first_line[eid]]).append(node.sourceline)
            if eid and valid_id.match(eid) is None:
                problems.append(InvalidId(name, node.sourceline, eid))
        for eid, locs in iteritems(repeated):
            problems.append(DuplicateId(name, eid, locs))
    return problems
Example #28
0
def get_filename(original_url_parsed, response):
    ''' Choose a file name for a downloaded resource.

    The name is taken from the response, falling back to the last component
    of the URL path and finally to 'unknown'. If the response declares a
    Content-Type that differs from the type guessed from the name, an
    extension registered for the declared type is appended. '''
    filename = get_download_filename_from_response(response)
    if not filename:
        filename = posixpath.basename(original_url_parsed.path) or 'unknown'
    header = response.info().get('Content-Type', '')
    if not header:
        return filename
    # Drop any parameters (charset etc.), keep only the media type itself
    declared = cgi.parse_header(header)[0].lower()
    if declared and guess_type(filename) != declared:
        candidates = mimetypes.guess_all_extensions(declared)
        if candidates:
            filename += candidates[0]
    return filename
Example #29
0
def pretty_all(container):
    ' Pretty print all HTML/CSS/XML files in the container '
    xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
        elif mt in OEB_STYLES:
            # Stylesheets are only parsed here and then marked dirty below
            container.parsed(name)
        elif name == container.opf_name:
            opf_root = container.parsed(name)
            pretty_opf(opf_root)
            pretty_xml_tree(opf_root)
        elif mt in xml_types:
            pretty_xml_tree(container.parsed(name))
        else:
            # Not a file type that is pretty printed, leave it untouched
            continue
        container.dirty(name)
Example #30
0
    def replace_links(self, name, replace_func):
        ''' Rewrite every link in the file called name with replace_func.

        replace_func is a callable that takes a URL and returns its
        replacement. It must expose a 'replaced' attribute that is True if it
        actually changed anything. The :class:`LinkReplacer` and
        :class:`LinkRebaser` classes are convenient ways of building such
        callables. The file is marked dirty when something was replaced and
        the value of the 'replaced' attribute is returned. '''
        media_type = self.mime_map.get(name, guess_type(name))
        if name == self.opf_name:
            for node in self.opf_xpath('//*[@href]'):
                node.set('href', replace_func(node.get('href')))
        else:
            kind = media_type.lower()
            if kind in OEB_DOCS:
                rewrite_links(self.parsed(name), replace_func)
            elif kind in OEB_STYLES:
                replaceUrls(self.parsed(name), replace_func)
            elif kind == guess_type('toc.ncx'):
                for node in self.parsed(name).xpath('//*[@src]'):
                    node.set('src', replace_func(node.get('src')))

        if replace_func.replaced:
            self.dirty(name)
        return replace_func.replaced
Example #31
0
    def requestStarted(self, rq):
        ''' Answer a request made over the fake protocol.

        Only GET requests to one of the allowed hosts are served, everything
        else is failed. Depending on the request path the reply is a file from
        the book, the book manifest, the reader background image, a MathJax
        resource or the viewer HTML itself. '''
        if bytes(rq.requestMethod()) != b'GET':
            return self.fail_request(rq, rq.RequestDenied)
        url = rq.requestUrl()
        host = url.host()
        if host not in self.allowed_hosts or url.scheme() != FAKE_PROTOCOL:
            return self.fail_request(rq)
        name = url.path()[1:]
        # The sandbox host may only be used to load files from the book
        if host == SANDBOX_HOST and not name.startswith('book/'):
            return self.fail_request(rq)
        if name.startswith('book/'):
            name = name.partition('/')[2]
            if name == '__index__':
                send_reply(rq, 'text/html', b'<div>\xa0</div>')
                return
            elif name == '__popup__':
                send_reply(
                    rq, 'text/html',
                    b'<div id="calibre-viewer-footnote-iframe">\xa0</div>')
                return
            try:
                data, mime_type = get_data(name)
                if data is None:
                    rq.fail(rq.UrlNotFound)
                    return
                data = as_bytes(data)
                mime_type = {
                    # Prevent warning in console about mimetype of fonts
                    'application/vnd.ms-opentype': 'application/x-font-ttf',
                    'application/x-font-truetype': 'application/x-font-ttf',
                    'application/font-sfnt': 'application/x-font-ttf',
                }.get(mime_type, mime_type)
                send_reply(rq, mime_type, data)
            except Exception:
                import traceback
                traceback.print_exc()
                return self.fail_request(rq, rq.RequestFailed)
        elif name == 'manifest':
            data = b'[' + set_book_path.manifest + b',' + set_book_path.metadata + b']'
            send_reply(rq, set_book_path.manifest_mime, data)
        elif name == 'reader-background':
            mt, data = background_image()
            if data:
                send_reply(rq, mt, data)
            else:
                rq.fail(rq.UrlNotFound)
        elif name.startswith('mathjax/'):
            from calibre.gui2.viewer.mathjax import monkeypatch_mathjax
            if name == 'mathjax/manifest.json':
                # The manifest is built once and cached on the instance
                if self.mathjax_manifest is None:
                    import json
                    from calibre.srv.books import get_mathjax_manifest
                    self.mathjax_manifest = as_bytes(
                        json.dumps(get_mathjax_manifest()['files']))
                send_reply(rq, 'application/json', self.mathjax_manifest)
                return
            path = os.path.abspath(os.path.join(self.mathjax_dir, '..', name))
            # Compare against the folder *with* a trailing separator, a bare
            # prefix test would also accept a sibling such as "mathjax-x"
            mathjax_root = os.path.join(self.mathjax_dir, '')
            if not path.startswith(mathjax_root):
                # Requests that escape the mathjax folder must still be
                # answered, otherwise they are left pending
                return self.fail_request(rq)
            mt = guess_type(name)
            try:
                with lopen(path, 'rb') as f:
                    raw = f.read()
            except EnvironmentError as err:
                prints(
                    "Failed to get mathjax file: {} with error: {}".format(
                        name, err))
                return self.fail_request(rq, rq.RequestFailed)
            if 'MathJax.js' in name:
                raw = monkeypatch_mathjax(
                    raw.decode('utf-8')).encode('utf-8')

            send_reply(rq, mt, raw)
        elif not name:
            send_reply(rq, 'text/html', viewer_html())
        else:
            return self.fail_request(rq)
Example #32
0
 def raw_data(self, name, decode=True):
     ''' Return the contents of the file identified by name.

     When decode is True and the file is a stylesheet, an HTML document,
     plain text or some kind of XML, the data is passed through
     self.decode() before being returned, otherwise the data is returned
     exactly as read. '''
     # Use a context manager so the file handle is closed immediately
     # instead of whenever the garbage collector gets to it
     with self.open(name) as f:
         ans = f.read()
     mime = self.mime_map.get(name, guess_type(name))
     if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
         ans = self.decode(ans)
     return ans
Example #33
0
def process_exploded_book(book_fmt,
                          opfpath,
                          input_fmt,
                          tdir,
                          render_manager,
                          log=None,
                          book_hash=None,
                          save_bookmark_data=False,
                          book_metadata=None,
                          virtualize_resources=True):
    ''' Prepare the exploded book in tdir for rendering.

    The stylesheets, HTML and SVG files are handed to render_manager, the
    results are merged and written, together with the ToC, spine, landmarks
    and per file data, to calibre-book-manifest.json in the root of the
    container. Files that are not needed for rendering are deleted.

    Returns the tuple (container, bookmark_data). bookmark_data is None
    unless save_bookmark_data is True and the book has a bookmarks file. '''
    log = log or default_log
    container = SimpleContainer(tdir, opfpath, log)
    input_plugin = plugin_for_input_format(input_fmt)
    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))

    def needs_work(mt):
        return mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'

    def work_priority(name):
        # Sort key: HTML files before everything else, then by increasing
        # file size.
        # NOTE(review): the original comment said the intent is that workers
        # with large files or stylesheets get fewer names; how the names are
        # distributed is decided by render_manager -- confirm
        size = os.path.getsize(container.name_path_map[name])
        is_html = container.mime_map.get(name) in OEB_DOCS
        return (0 if is_html else 1), size

    if not is_comic:
        render_manager.launch_workers(
            tuple(n for n, mt in iteritems(container.mime_map)
                  if needs_work(mt)), container)

    bookmark_data = None
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        if container.exists(bm_file):
            with container.open(bm_file, 'rb') as f:
                bookmark_data = f.read()

    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name
        for name, mt in iteritems(container.mime_map)
        if name == container.opf_name or mt == guess_type('a.ncx')
        or name.startswith('META-INF/') or name == 'mimetype'
        or not container.has_name_and_is_not_empty(name)
    }
    raster_cover_name, titlepage_name = create_cover_page(
        container, input_fmt.lower(), is_comic, book_metadata)

    toc = get_toc(container, verify_destinations=False).to_dict(count())
    if not toc or not toc.get('children'):
        # No usable ToC in the book, build one from the headings instead
        toc = from_xpaths(container,
                          ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in container.spine_names]
    spineq = frozenset(spine)
    landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]

    book_render_data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }

    # The names are collected again, after the cover page has been created,
    # rather than reusing the tuple passed to launch_workers() above
    names = sorted(
        (n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
        key=work_priority)

    results = render_manager(
        names,
        (tdir, opfpath, virtualize_resources, book_render_data['link_uid'],
         container.data_for_clone()), container)
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()

    def merge_ltm(dest, src):
        for k, v in iteritems(src):
            if k in dest:
                dest[k] |= v
            else:
                dest[k] = v

    # Combine the partial results of the individual render jobs
    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
                ltm[k] = v

    def manifest_data(name):
        mt = (container.mime_map.get(name)
              or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(container.name_path_map[name]),
            'is_virtualized': name in virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            data = html_data[name]
            ans['length'] = l = data['length']
            book_render_data['total_length'] += l
            # Constant time lookup instead of scanning the spine list for
            # every single file
            if name in spineq:
                book_render_data['spine_length'] += l
            ans['has_maths'] = hm = data['has_maths']
            if hm:
                book_render_data['has_maths'] = True
            ans['anchor_map'] = data['anchor_map']
        return ans

    book_render_data['files'] = {
        name: manifest_data(name)
        for name in set(container.name_path_map) - excluded_names
    }
    container.commit()

    for name in excluded_names:
        os.remove(container.name_path_map[name])

    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
    with lopen(os.path.join(container.root, 'calibre-book-manifest.json'),
               'wb') as f:
        f.write(data)

    return container, bookmark_data
Example #34
0
 def image_names(self):
     ''' Yield the names of all PNG, JPEG and GIF images in the container '''
     wanted = set()
     for ext in ('png', 'jpeg', 'gif'):
         wanted.add(guess_type('a.' + ext))
     for name, mt in iteritems(self.container.mime_map):
         if mt.lower() not in wanted:
             continue
         yield name
Example #35
0
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    ''' Create an empty book in the specified format at the specified location.

    mi supplies the metadata of the new book. The book consists of an OPF
    file, a single HTML file that contains the title as a heading and an NCX
    table of contents. When fmt is 'azw3' these files are converted to an
    AZW3 file, for every other value of fmt they are stored in an EPUB
    container at path. '''
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Use the first non-empty language declared in the metadata
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    # The language ends up inside an attribute value, so it has to be
    # escaped just like the OPF name above
    HTML = '''\
<?xml version='1.0' encoding='utf-8'?>
<html lang="{1}" xmlns="http://www.w3.org/1999/xhtml">

    <head>
        <title>{0}</title>
    </head>

    <body>
        <h1>{0}</h1>
    </body>
</html>
    '''.format(prepare_string_for_xml(mi.title), prepare_string_for_xml(lang, True)).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)
    if fmt == 'azw3':
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # 0o755 is accepted by Python 2.6+ and Python 3, the old 0755
            # spelling is a syntax error on Python 3
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
Example #36
0
    def __init__(self,
                 book_fmt,
                 opfpath,
                 input_fmt,
                 tdir,
                 log=None,
                 book_hash=None,
                 save_bookmark_data=False,
                 book_metadata=None,
                 allow_no_cover=True,
                 virtualize_resources=True):
        ''' Prepare the exploded book in tdir for rendering.

        The files of the book are transformed and a description of the book
        (ToC, spine, landmarks and per file data) is written to
        calibre-book-manifest.json in the root of the container. Files that
        are not needed for rendering are deleted.

        After construction self.bookmark_data holds the contents of the
        bookmarks file of the book, or None if save_bookmark_data is False
        or the book has no such file. '''
        log = log or default_log
        self.allow_no_cover = allow_no_cover
        ContainerBase.__init__(self, tdir, opfpath, log)
        self.book_metadata = book_metadata
        input_plugin = plugin_for_input_format(input_fmt)
        self.is_comic = bool(
            getattr(input_plugin, 'is_image_collection', False))
        # Always create the attribute, so that reading it cannot fail when
        # save_bookmark_data is False
        self.bookmark_data = None
        if save_bookmark_data:
            bm_file = 'META-INF/calibre_bookmarks.txt'
            if self.exists(bm_file):
                with self.open(bm_file, 'rb') as f:
                    self.bookmark_data = f.read()
        # We do not add zero byte sized files as the IndexedDB API in the
        # browser has no good way to distinguish between zero byte files and
        # load failures.
        excluded_names = {
            name
            for name, mt in iteritems(self.mime_map) if name == self.opf_name
            or mt == guess_type('a.ncx') or name.startswith('META-INF/')
            or name == 'mimetype' or not self.has_name_and_is_not_empty(name)
        }
        raster_cover_name, titlepage_name = self.create_cover_page(
            input_fmt.lower())

        toc = get_toc(self).to_dict(count())
        if not toc or not toc.get('children'):
            # No usable ToC in the book, build one from the headings instead
            toc = from_xpaths(self,
                              ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
        spine = [name for name, is_linear in self.spine_names]
        spineq = frozenset(spine)
        landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]

        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc': toc,
            'book_format': book_fmt,
            'spine': spine,
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': self.is_comic,
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
            'toc_anchor_map': toc_anchor_map(toc),
            'landmarks': landmarks,
            'link_to_map': {},
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in spine:
            self.parsed(name), self.dirty(name)
        self.virtualized_names = set()
        self.transform_all(virtualize_resources)

        def manifest_data(name):
            mt = (self.mime_map.get(name)
                  or 'application/octet-stream').lower()
            ans = {
                'size': os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype': mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = l = get_length(root)
                self.book_render_data['total_length'] += l
                # Constant time lookup that also does not depend on the
                # local name data, which is rebound further below
                if name in spineq:
                    self.book_render_data['spine_length'] += l
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
                ans['anchor_map'] = anchor_map(root)
            return ans

        data['files'] = {
            name: manifest_data(name)
            for name in set(self.name_path_map) - excluded_names
        }
        self.commit()
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        data = json.dumps(self.book_render_data, ensure_ascii=False)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'),
                   'wb') as f:
            f.write(data)
Example #37
0
def subset_all_fonts(container, font_stats, report):
    ''' Subset all the fonts embedded in the container.

    font_stats maps font file names to the set of characters that the font
    is used for. Fonts without any characters are removed from the book,
    along with the @font-face rules that refer to them. All other fonts are
    reduced to the characters they are used for. WOFF fonts are skipped.

    report is called with a human readable message for every action taken.
    Returns True if the book was changed. '''
    remove = set()
    total_old = total_new = 0
    changed = False
    # items()/values() are used instead of iteritems()/itervalues() so that
    # this works on both Python 2 and Python 3
    for name, mt in container.mime_map.items():
        if (mt in OEB_FONTS or name.rpartition('.')[-1].lower()
                in {'otf', 'ttf'}) and mt != guess_type('a.woff'):
            chars = font_stats.get(name, set())
            with container.open(name, 'rb') as f:
                f.seek(0, os.SEEK_END)
                total_old += f.tell()
            if not chars:
                remove.add(name)
                report('Removed unused font: %s' % name)
                continue
            with container.open(name, 'r+b') as f:
                raw = f.read()
                font_name = get_font_names(raw)[-1]
                warnings = []
                container.log('Subsetting font: %s' % (font_name or name))
                try:
                    nraw, old_sizes, new_sizes = subset(raw,
                                                        chars,
                                                        warnings=warnings)
                except UnsupportedFont as e:
                    container.log.warning(
                        'Unsupported font: %s, ignoring.  Error: %s' %
                        (name, as_unicode(e)))
                    # The font stays in the book unchanged, so it has to be
                    # counted at its full size in the final total as well
                    total_new += len(raw)
                    continue

                for w in warnings:
                    container.log.warn(w)
                olen = sum(old_sizes.values())
                nlen = sum(new_sizes.values())
                total_new += len(nraw)
                if nlen == olen:
                    report('The font %s was already subset' % font_name)
                else:
                    # 100.0 forces float arithmetic, with Python 2 integer
                    # division the percentage would be truncated to zero
                    report(
                        'Decreased the font %s to %.1f%% of its original size'
                        % (font_name, 100.0 * nlen / olen))
                    changed = True
                # Overwrite the font file in place with the subset font
                f.seek(0)
                f.truncate()
                f.write(nraw)

    for name in remove:
        container.remove_item(name)
        changed = True

    if remove:
        # Get rid of the @font-face rules that refer to the removed fonts
        for name, mt in container.mime_map.items():
            if mt in OEB_STYLES:
                sheet = container.parsed(name)
                if remove_font_face_rules(container, sheet, remove, name):
                    container.dirty(name)
            elif mt in OEB_DOCS:
                for style in XPath('//h:style')(container.parsed(name)):
                    if style.get('type',
                                 'text/css') == 'text/css' and style.text:
                        sheet = container.parse_css(style.text, name)
                        if remove_font_face_rules(container, sheet, remove,
                                                  name):
                            style.text = sheet.cssText
                            container.dirty(name)
    if total_old > 0:
        report('Reduced total font size to %.1f%% of original' %
               (100.0 * total_new / total_old))
    else:
        report('No embedded fonts found')
    return changed
Example #38
0
def _duplicate_locations(elems, attr):
    ' Map every value of attr used by more than one of elems to the source lines of all the elements using it '
    seen, dups = {}, {}
    for elem in elems:
        val = elem.get(attr)
        if val in seen:
            dups.setdefault(val, [seen[val]]).append(elem.sourceline)
        else:
            seen[val] = elem.sourceline
    return dups


def check_opf(container):
    ''' Check the OPF file of the container for common problems.

    The checks cover the root element, the mandatory sections, idrefs that
    point nowhere, non-linear spine items, missing and duplicated manifest
    and spine entries, the ToC reference, the cover declaration, the unique
    identifier and the media types of the spine items.

    Returns a list of error objects. '''
    errors = []

    if container.opf.tag != OPF('package'):
        err = BaseError(_('The OPF does not have the correct root element'),
                        container.opf_name)
        err.HELP = xml(
            _('The opf must have the root element <package> in namespace {0}, like this: <package xmlns="{0}">'
              )).format(OPF2_NS)
        errors.append(err)

    for tag in ('metadata', 'manifest', 'spine'):
        if not container.opf_xpath('/opf:package/opf:' + tag):
            errors.append(MissingSection(container.opf_name, tag))

    all_ids = set(container.opf_xpath('//*/@id'))
    for elem in container.opf_xpath('//*[@idref]'):
        if elem.get('idref') not in all_ids:
            errors.append(
                IncorrectIdref(container.opf_name, elem.get('idref'),
                               elem.sourceline))

    nl_items = [
        elem.sourceline for elem in container.opf_xpath(
            '//opf:spine/opf:itemref[@linear="no"]')
    ]
    if nl_items:
        errors.append(NonLinearItems(container.opf_name, nl_items))

    # Manifest entries that point to missing files or share a href
    manifest_items = container.opf_xpath(
        '/opf:package/opf:manifest/opf:item[@href]')
    for item in manifest_items:
        href = item.get('href')
        hname = container.href_to_name(href, container.opf_name)
        if not hname or not container.exists(hname):
            errors.append(
                MissingHref(container.opf_name, href, item.sourceline))
    # items() rather than iteritems(), so that this works on Python 3 as well
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs)
        for eid, locs in _duplicate_locations(manifest_items, 'href').items())

    # Spine entries that refer to the same manifest item
    spine_items = container.opf_xpath(
        '/opf:package/opf:spine/opf:itemref[@idref]')
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs, for_spine=True)
        for eid, locs in _duplicate_locations(spine_items, 'idref').items())

    spine = container.opf_xpath('/opf:package/opf:spine[@toc]')
    if spine:
        spine = spine[0]
        mitems = [
            x for x in container.opf_xpath(
                '/opf:package/opf:manifest/opf:item[@id]')
            if x.get('id') == spine.get('toc')
        ]
        if mitems:
            mitem = mitems[0]
            if mitem.get('media-type', '') != guess_type('a.ncx'):
                errors.append(
                    IncorrectToc(container.opf_name,
                                 mitem.sourceline,
                                 bad_mimetype=mitem.get('media-type')))
        else:
            errors.append(
                IncorrectToc(container.opf_name,
                             spine.sourceline,
                             bad_idref=spine.get('toc')))

    covers = container.opf_xpath(
        '/opf:package/opf:metadata/opf:meta[@name="cover"]')
    if len(covers) > 0:
        if len(covers) > 1:
            errors.append(
                MultipleCovers(container.opf_name,
                               [c.sourceline for c in covers]))
        manifest_ids = set(
            container.opf_xpath('/opf:package/opf:manifest/opf:item/@id'))
        for cover in covers:
            if cover.get('content', None) not in manifest_ids:
                errors.append(
                    IncorrectCover(container.opf_name, cover.sourceline,
                                   cover.get('content', '')))
            # tostring() returns bytes, so the needles have to be bytes as
            # well, str needles raise a TypeError on Python 3
            raw = etree.tostring(cover)
            try:
                n, c = raw.index(b'name="'), raw.index(b'content="')
            except ValueError:
                n = c = -1
            # Reported when the name attribute comes after the content
            # attribute in the serialized element
            if n > -1 and c > -1 and n > c:
                errors.append(NookCover(container.opf_name, cover.sourceline))

    uid = container.opf.get('unique-identifier', None)
    if uid is None or not container.opf_xpath(
            '/opf:package/opf:metadata/dc:identifier[@id=%r]' % uid):
        errors.append(NoUID(container.opf_name))

    for item, name, linear in container.spine_iter:
        mt = container.mime_map[name]
        if mt != XHTML_MIME:
            iid = item.get('idref', None)
            lnum = None
            if iid:
                mitem = container.opf_xpath(
                    '/opf:package/opf:manifest/opf:item[@id=%r]' % iid)
                if mitem:
                    lnum = mitem[0].sourceline
                else:
                    iid = None
            errors.append(BadSpineMime(name, iid, mt, lnum,
                                       container.opf_name))

    return errors
Example #39
0
def create_book(mi,
                path,
                fmt='epub',
                opf_name='metadata.opf',
                html_name='start.xhtml',
                toc_name='toc.ncx'):
    ''' Create an empty book in the specified format at the specified location.

    mi supplies the metadata of the new book. A txt book contains just the
    title, a docx book is created by the DOCX writer. For all other formats
    an OPF file, a single HTML file generated from the new book template and
    an NCX table of contents are created, and either converted to AZW3 or
    stored in an EPUB container at path.

    Raises ValueError if fmt is not one of valid_empty_formats. '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)
    if fmt == 'txt':
        with open(path, 'wb') as f:
            if not mi.is_null('title'):
                # The file is opened in binary mode, so a text title has to
                # be encoded before it can be written
                title = mi.title
                if not isinstance(title, bytes):
                    title = title.encode('utf-8')
                f.write(title)
        return
    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        p = Plumber('a.docx', 'b.docx', default_log)
        p.setup_options()
        # Use the word default of one inch page margins
        for x in 'left right top bottom'.split():
            setattr(p.opts, 'margin_' + x, 72)
        DOCX(p.opts, default_log).write(path, mi, create_empty_document=True)
        return
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Use the first non-empty language declared in the metadata
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)).replace(
            '_TITLE_', prepare_string_for_xml(mi.title)).replace(
                '_AUTHORS_',
                prepare_string_for_xml(authors_to_string(
                    mi.authors))).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang),
                         encoding='utf-8',
                         xml_declaration=True,
                         pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf,
                         encoding='utf-8',
                         xml_declaration=True,
                         pretty_print=True)
    if fmt == 'azw3':
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name,
                                                                    ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name,
                          DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype',
                        b'application/epub+zip',
                        compression=ZIP_STORED)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
Example #40
0
def check_links(container):
    """
    Check the links found in the documents, stylesheets, OPF and NCX of a book.

    Every link is classified (empty, dangling, wrong case, pointing at a
    folder, file:// URL, absolute local path, containing a colon). The
    resolved links are then used to find stylesheets, documents and other
    resources that nothing reachable from the spine refers to, files that are
    missing from the manifest and the calibre bookmarks file.

    :param container: The book container to check
    :return: List of error objects, in the order they were detected
    """
    errors = []
    report = errors.append
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}

    def quoted(value):
        # repr() of the value without a leading u (python 2 unicode prefix)
        text = repr(value)
        return text[1:] if text.startswith('u') else text

    for name, mt in iteritems(container.mime_map):
        if not (mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types):
            continue
        for href, lnum, col in container.iterlinks(name):
            if not href:
                report(EmptyLink(_('The link is empty'), name, lnum, col))
            try:
                tname = container.href_to_name(href, name)
            except ValueError:
                # Absolute paths to files on another drive in windows cause this
                tname = None

            if tname is None:
                # The link does not resolve to a name inside the book
                purl = urlparse(href)
                is_local = purl.scheme in {'', 'file'}
                if purl.scheme == 'file':
                    report(FileLink(
                        _('The link %s is a file:// URL') % quoted(href),
                        name, lnum, col))
                elif purl.path and purl.path.startswith('/') and is_local:
                    report(LocalLink(
                        _('The link %s points to a file outside the book'
                          ) % quoted(href), name, lnum, col))
                elif purl.path and is_local and ':' in urlunquote(purl.path):
                    report(InvalidCharInLink(
                        _('The link %s contains a : character, this will cause errors on Windows computers'
                          ) % quoted(href), name, lnum, col))
                continue

            if not container.exists(tname):
                cname = corrected_case_for_name(container, tname)
                if cname is None:
                    report(DanglingLink(
                        _('The linked resource %s does not exist')
                        % quoted(href), tname, name, lnum, col))
                else:
                    report(CaseMismatch(href, cname, name, lnum, col))
            elif tname in container.mime_map:
                links_map[name].add(tname)
            elif os.path.isdir(container.name_to_abspath(tname)):
                # Filesystem says the file exists, but it is not in the
                # mime_map, so either there is a case mismatch or the link
                # is a directory
                report(BadLink(
                    _('The linked resource %s is a folder')
                    % quoted(href), name, lnum, col))
            else:
                report(CaseMismatch(
                    href, actual_case_for_name(container, tname),
                    name, lnum, col))

    def linked_styles(sources):
        # Stylesheets directly linked from any of the given names
        return {
            target
            for source in sources for target in links_map[source]
            if container.mime_map.get(target, None) in OEB_STYLES
        }

    spine_docs = {name for name, linear in container.spine_names}
    spine_styles = linked_styles(spine_docs)
    while True:
        # Handle import rules in stylesheets: keep following links between
        # stylesheets until no new stylesheet is discovered
        known = len(spine_styles)
        spine_styles |= linked_styles(spine_styles)
        if len(spine_styles) <= known:
            break

    seen = set(OEB_DOCS) | set(OEB_STYLES)
    spine_resources = {
        target
        for source in spine_docs | spine_styles
        for target in links_map[source]
        if container.mime_map[target] not in seen
    }
    unreferenced = set()

    cover_name = container.guide_type_map.get('cover', None)
    nav_items = frozenset(container.manifest_items_with_property('nav'))

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES and name not in spine_styles:
            problem = UnreferencedResource
        elif mt in OEB_DOCS and name not in spine_docs and name not in nav_items:
            problem = UnreferencedDoc
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {
                'image', 'audio', 'video'
        }) and name not in spine_resources and name != cover_name:
            # The raster cover image is allowed to be unreferenced
            if mt.partition('/')[0] == 'image' and \
                    name == get_raster_cover_name(container):
                continue
            problem = UnreferencedResource
        else:
            continue
        report(problem(name))
        unreferenced.add(name)

    manifest_names = set(itervalues(container.manifest_id_map))
    for name in container.mime_map:
        if not (name in manifest_names or
                container.ok_to_be_unmanifested(name)):
            report(Unmanifested(name, unreferenced=name in unreferenced))
        if name == 'META-INF/calibre_bookmarks.txt':
            report(Bookmarks(name))

    return errors
Example #41
0
def check_opf(container):
    """
    Validate the OPF of a book and return the list of problems found.

    The checks run in a fixed order, which is also the order of the returned
    errors: root element and version, required sections, ids and idrefs,
    non-linear spine items, manifest hrefs, duplicated manifest/spine
    entries, the NCX reference of the spine, the nav document (OPF version 3
    and later), cover <meta> tags, the unique identifier and the media types
    of the spine items.

    Attribute values read from the OPF are never interpolated into XPath
    expressions; elements are selected by XPath and compared in Python, so
    values containing quotes or backslashes cannot break or alter a query.

    :param container: The book container whose OPF is checked
    :return: List of error objects
    """
    errors = []
    opf_version = container.opf_version_parsed

    if container.opf.tag != OPF('package'):
        err = BaseError(_('The OPF does not have the correct root element'),
                        container.opf_name, container.opf.sourceline)
        err.HELP = xml(
            _('The opf must have the root element <package> in namespace {0}, like this: <package xmlns="{0}">'
              )).format(OPF2_NS)
        errors.append(err)

    elif container.opf.get(
            'version') is None and container.book_type == 'epub':
        err = BaseError(_('The OPF does not have a version'),
                        container.opf_name, container.opf.sourceline)
        err.HELP = xml(
            _('The <package> tag in the OPF must have a version attribute. This is usually version="2.0" for EPUB2 and AZW3 and version="3.0" for EPUB3'
              ))
        errors.append(err)

    # Required top-level sections
    for tag in ('metadata', 'manifest', 'spine'):
        if not container.opf_xpath('/opf:package/opf:' + tag):
            errors.append(MissingSection(container.opf_name, tag))

    # Empty ids and idrefs that point at no existing id
    all_ids = set(container.opf_xpath('//*/@id'))
    if '' in all_ids:
        for empty_id_tag in container.opf_xpath('//*[@id=""]'):
            errors.append(EmptyID(container.opf_name, empty_id_tag.sourceline))
    all_ids.discard('')
    for elem in container.opf_xpath('//*[@idref]'):
        if elem.get('idref') not in all_ids:
            errors.append(
                IncorrectIdref(container.opf_name, elem.get('idref'),
                               elem.sourceline))

    nl_items = [
        elem.sourceline for elem in container.opf_xpath(
            '//opf:spine/opf:itemref[@linear="no"]')
    ]
    if nl_items:
        errors.append(NonLinearItems(container.opf_name, nl_items))

    # Manifest items: missing href attributes, hrefs that do not resolve to
    # an existing file and hrefs used more than once
    seen, dups = {}, {}
    for item in container.opf_xpath('/opf:package/opf:manifest/opf:item'):
        href = item.get('href', None)
        if href is None:
            errors.append(
                NoHref(container.opf_name, item.get('id', None),
                       item.sourceline))
        else:
            try:
                hname = container.href_to_name(href, container.opf_name)
            except ValueError:
                # href_to_name() can raise ValueError, for example for
                # absolute paths to files on another drive on windows. Such
                # an href cannot name a file in the book, so report it as
                # missing instead of aborting the whole check
                hname = None
            if not hname or not container.exists(hname):
                errors.append(
                    MissingHref(container.opf_name, href, item.sourceline))
            if href in seen:
                dups.setdefault(href, [seen[href]]).append(item.sourceline)
            else:
                seen[href] = item.sourceline
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs)
        for eid, locs in iteritems(dups))

    # Spine itemrefs that refer to the same manifest id more than once
    seen, dups = {}, {}
    for item in container.opf_xpath(
            '/opf:package/opf:spine/opf:itemref[@idref]'):
        ref = item.get('idref')
        if ref in seen:
            dups.setdefault(ref, [seen[ref]]).append(item.sourceline)
        else:
            seen[ref] = item.sourceline
    errors.extend(
        DuplicateHref(container.opf_name, eid, locs, for_spine=True)
        for eid, locs in iteritems(dups))

    # The toc attribute of the spine must point at a manifest item with the
    # NCX media type; if there is no toc attribute but the manifest has an
    # NCX, the reference is missing
    spine = container.opf_xpath('/opf:package/opf:spine[@toc]')
    if spine:
        spine = spine[0]
        mitems = [
            x for x in container.opf_xpath(
                '/opf:package/opf:manifest/opf:item[@id]')
            if x.get('id') == spine.get('toc')
        ]
        if mitems:
            mitem = mitems[0]
            if mitem.get('media-type', '') != guess_type('a.ncx'):
                errors.append(
                    IncorrectToc(container.opf_name,
                                 mitem.sourceline,
                                 bad_mimetype=mitem.get('media-type')))
        else:
            errors.append(
                IncorrectToc(container.opf_name,
                             spine.sourceline,
                             bad_idref=spine.get('toc')))
    else:
        spine = container.opf_xpath('/opf:package/opf:spine')
        if spine:
            spine = spine[0]
            ncx = container.manifest_type_map.get(guess_type('a.ncx'))
            if ncx:
                ncx_name = ncx[0]
                rmap = {v: k for k, v in iteritems(container.manifest_id_map)}
                ncx_id = rmap.get(ncx_name)
                if ncx_id:
                    errors.append(
                        MissingNCXRef(container.opf_name, spine.sourceline,
                                      ncx_id))

    if opf_version.major > 2:
        existing_nav = find_existing_nav_toc(container)
        if existing_nav is None:
            errors.append(MissingNav(container.opf_name, 0))
        else:
            toc = parse_nav(container, existing_nav)
            if len(toc) == 0:
                errors.append(EmptyNav(existing_nav, 0))

    covers = container.opf_xpath(
        '/opf:package/opf:metadata/opf:meta[@name="cover"]')
    if covers:
        if len(covers) > 1:
            errors.append(
                MultipleCovers(container.opf_name,
                               [c.sourceline for c in covers]))
        manifest_ids = set(
            container.opf_xpath('/opf:package/opf:manifest/opf:item/@id'))
        for cover in covers:
            if cover.get('content', None) not in manifest_ids:
                errors.append(
                    IncorrectCover(container.opf_name, cover.sourceline,
                                   cover.get('content', '')))
            # Flag cover tags whose serialized form has the content
            # attribute before the name attribute
            raw = etree.tostring(cover)
            try:
                n, c = raw.index(b'name="'), raw.index(b'content="')
            except ValueError:
                n = c = -1
            if n > -1 and c > -1 and n > c:
                errors.append(NookCover(container.opf_name, cover.sourceline))

    # The unique-identifier attribute of <package> must name the id of a
    # dc:identifier element. The comparison is done in Python rather than by
    # formatting uid into the XPath expression, since repr() quoting is not
    # XPath quoting
    uid = container.opf.get('unique-identifier', None)
    identifiers = container.opf_xpath(
        '/opf:package/opf:metadata/dc:identifier')
    if uid is None or not any(
            elem.get('id') == uid for elem in identifiers):
        errors.append(NoUID(container.opf_name))
    for elem in identifiers:
        if not elem.text or not elem.text.strip():
            errors.append(EmptyIdentifier(container.opf_name, elem.sourceline))

    # Line of the first manifest item carrying each id, used to locate the
    # manifest entry of spine items that have the wrong media type
    manifest_lines = {}
    for mitem in container.opf_xpath(
            '/opf:package/opf:manifest/opf:item[@id]'):
        manifest_lines.setdefault(mitem.get('id'), mitem.sourceline)

    for item, name, linear in container.spine_iter:
        mt = container.mime_map[name]
        if mt == XHTML_MIME:
            continue
        iid = item.get('idref', None)
        lnum = None
        if iid:
            if iid in manifest_lines:
                lnum = manifest_lines[iid]
            else:
                # No manifest item has this id
                iid = None
        errors.append(BadSpineMime(name, iid, mt, lnum,
                                   container.opf_name))

    return errors
Example #42
0
def iter_subsettable_fonts(container):
    """
    Yield ``(name, mime type)`` for the font files of the book, leaving out WOFF.

    A file is a candidate when its mime type is in OEB_FONTS or its extension
    is otf or ttf (case insensitive); candidates whose mime type is the one
    guessed for .woff or .woff2 are skipped.
    """
    excluded_types = (guess_type('a.woff'), guess_type('a.woff2'))
    for name, mt in iteritems(container.mime_map):
        is_font = mt in OEB_FONTS or \
            name.rpartition('.')[-1].lower() in {'otf', 'ttf'}
        if not is_font:
            continue
        if mt not in excluded_types:
            yield name, mt
Example #43
0
                                     rewrite_links, iterlinks, itercsslinks,
                                     urlquote, urlunquote)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
from calibre.ebooks.oeb.polish.utils import PositionFinder, CommentFinder, guess_type
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.filenames import nlinks_file, hardlink_file
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile

# Module-level aliases for the os.path helpers
exists = os.path.exists
join = os.path.join
relpath = os.path.relpath

# Mime types regarded as fonts: whatever guess_type() reports for the ttf,
# otf and woff extensions, plus two explicit x-font types
OEB_FONTS = {
    guess_type('a.ttf'), guess_type('b.otf'), guess_type('a.woff'),
} | {
    'application/x-font-ttf', 'application/x-font-otf',
}
# Maps the opf and dc prefixes to their namespaces
OPF_NAMESPACES = dict(opf=OPF2_NS, dc=DC11_NS)


class CSSPreProcessor(cssp):
    ' CSS preprocessor whose call only applies the MS_PAT substitution '

    def __call__(self, data):
        # Replace every MS_PAT match in data using ms_sub, nothing else
        pattern, replacement = self.MS_PAT, self.ms_sub
        return pattern.sub(replacement, data)


def clone_dir(src, dest):
    ' Clone a directory using hard links for the files, dest must already exist '
    for x in os.listdir(src):
        dpath = os.path.join(dest, x)
Example #44
0
    def __init__(self, path_to_ebook, tdir, log=None, book_hash=None):
        '''
        Extract the book at path_to_ebook into tdir and prepare it for rendering.

        Builds ``self.book_render_data`` (table of contents, spine, landmarks,
        cover and title page names, per-file data), commits the container,
        deletes the excluded files from disk and finally writes the render
        data as JSON to ``calibre-book-manifest.json`` inside ``self.root``.

        :param path_to_ebook: Path of the book, passed to extract_book()
        :param tdir: Directory passed to extract_book() and to ContainerBase.__init__()
        :param log: Log object, default_log is used when it is falsy
        :param book_hash: Stored unchanged under the ``book_hash`` key of the render data
        '''
        log = log or default_log
        book_fmt, opfpath, input_fmt = extract_book(path_to_ebook, tdir, log=log)
        ContainerBase.__init__(self, tdir, opfpath, log)
        # Files left out of data['files'] and deleted from disk at the end:
        # the OPF, anything with the NCX mime type, everything under
        # META-INF/ and the mimetype file
        excluded_names = {
            name for name, mt in self.mime_map.iteritems() if
            name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
            name == 'mimetype'
        }
        raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
        toc = get_toc(self).to_dict(count())
        spine = [name for name, is_linear in self.spine_names]
        spineq = frozenset(spine)
        # Only keep landmarks whose destination is a spine item
        landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]

        # data is an alias of self.book_render_data, both names refer to the
        # same dict and are used interchangeably below
        self.book_render_data = data = {
            'version': RENDER_VERSION,
            'toc':toc,
            'spine':spine,
            'link_uid': uuid4(),
            'book_hash': book_hash,
            'is_comic': input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'},
            'raster_cover_name': raster_cover_name,
            'title_page_name': titlepage_name,
            # The next three values are updated by manifest_data() below
            'has_maths': False,
            'total_length': 0,
            'spine_length': 0,
            'toc_anchor_map': toc_anchor_map(toc),
            'landmarks': landmarks,
        }
        # Mark the spine as dirty since we have to ensure it is normalized
        for name in data['spine']:
            self.parsed(name), self.dirty(name)
        self.transform_css()
        self.virtualized_names = set()
        self.virtualize_resources()

        def manifest_data(name):
            ' Return the data[\'files\'] entry for name, HTML files also update the totals in book_render_data '
            mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
            ans = {
                'size':os.path.getsize(self.name_path_map[name]),
                'is_virtualized': name in self.virtualized_names,
                'mimetype':mt,
                'is_html': mt in OEB_DOCS,
            }
            if ans['is_html']:
                root = self.parsed(name)
                ans['length'] = l = get_length(root)
                # total_length counts every HTML file, spine_length only
                # those that are in the spine
                self.book_render_data['total_length'] += l
                if name in data['spine']:
                    self.book_render_data['spine_length'] += l
                ans['has_maths'] = hm = check_for_maths(root)
                if hm:
                    self.book_render_data['has_maths'] = True
                ans['anchor_map'] = anchor_map(root)
            return ans
        data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
        self.commit()
        # The excluded files are removed only after the commit
        for name in excluded_names:
            os.remove(self.name_path_map[name])
        with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
            f.write(json.dumps(self.book_render_data, ensure_ascii=False).encode('utf-8'))