Ejemplo n.º 1
0
 def link_replacer(base, url):
     '''
     Rewrite a single link found in the file named base.

     Fragment-only links and relative links that href_to_name() resolves are
     rewritten via resource_template. Resolved names for which
     has_name_and_is_not_empty() is false become ``missing:`` URLs. Links
     with a netloc, a query, a non-file scheme, an empty path or a path
     starting with / are returned unchanged. base is added to changed
     (from the enclosing scope) whenever a link is rewritten.
     '''
     if url.startswith('#'):
         anchor = urlunquote(url[1:])
         if anchor:
             changed.add(base)
             return resource_template.format(encode_url(base, anchor))
         return url
     parts = urlparse(url)
     untouchable = (
         parts.netloc or parts.query or
         (parts.scheme and parts.scheme != 'file') or
         not parts.path or parts.path.startswith('/')
     )
     if untouchable:
         return url
     name = self.href_to_name(parts.path, base)
     if not name:
         # Unresolvable hrefs are returned as the bare path, without fragment
         return parts.path
     if self.has_name_and_is_not_empty(name):
         anchor = urlunquote(parts.fragment)
         rewritten = resource_template.format(encode_url(name, anchor))
     else:
         raw_name = name.encode('utf-8') if isinstance(name, unicode_type) else name
         rewritten = 'missing:' + force_unicode(quote(raw_name), 'utf-8')
     changed.add(base)
     return rewritten
Ejemplo n.º 2
0
 def link_replacer(base, url):
     '''
     Rewrite a single link found in the file named base.

     Fragment-only links and relative links that container.href_to_name()
     resolves are rewritten via resource_template. Resolved names for which
     container.has_name_and_is_not_empty() is false become ``missing:``
     URLs. Links with a netloc, a query, a non-file scheme, an empty path or
     a path starting with / are returned unchanged. base is added to changed
     (from the enclosing scope) whenever a link is rewritten.
     '''
     if url.startswith('#'):
         anchor = urlunquote(url[1:])
         if anchor:
             changed.add(base)
             return resource_template.format(encode_url(base, anchor))
         return url
     parts = urlparse(url)
     untouchable = (
         parts.netloc or parts.query or
         (parts.scheme and parts.scheme != 'file') or
         not parts.path or parts.path.startswith('/')
     )
     if untouchable:
         return url
     name = container.href_to_name(parts.path, base)
     if not name:
         # Unresolvable hrefs are returned as the bare path, without fragment
         return parts.path
     if container.has_name_and_is_not_empty(name):
         anchor = urlunquote(parts.fragment)
         rewritten = resource_template.format(encode_url(name, anchor))
     else:
         raw_name = name.encode('utf-8') if isinstance(name, unicode_type) else name
         rewritten = 'missing:' + force_unicode(quote(raw_name), 'utf-8')
     changed.add(base)
     return rewritten
Ejemplo n.º 3
0
 def __call__(self, oeb, opts):
     '''
     Replace <img> elements in the spine whose src is a ``data:image/`` URI
     with a reference to the href returned by convert_image_data_uri().

     Images whose payload cannot be decoded, or whose format imghdr cannot
     identify, are logged and left as they are.
     '''
     self.log = oeb.log
     find_images = XPath('//h:img[@src]')
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             continue
         for img in find_images(root):
             src = img.get('src', '')
             if not src.startswith('data:'):
                 continue
             header, sep, payload = src.partition(',')
             if not (header.startswith('data:image/') and payload):
                 continue
             if ';base64' not in header:
                 payload = urlunquote(payload)
             else:
                 # Whitespace is not significant in base64 data
                 payload = re.sub(r'\s+', '', payload)
                 from polyglot.binary import from_base64_bytes
                 try:
                     payload = from_base64_bytes(payload)
                 except Exception:
                     self.log.error('Found invalid base64 encoded data URI, ignoring it')
                     continue
             from imghdr import what
             fmt = what(None, payload)
             if not fmt:
                 self.log.warn('Image encoded as data URL has unknown format, ignoring')
                 continue
             new_href = self.convert_image_data_uri(payload, fmt, oeb)
             img.set('src', item.relhref(new_href))
Ejemplo n.º 4
0
 def __call__(self, oeb, opts):
     '''
     Replace <img> elements in the spine whose src is a ``data:image/`` URI
     with a reference to the href returned by convert_image_data_uri().

     Images whose payload cannot be decoded, or whose format imghdr cannot
     identify, are logged and left as they are.
     '''
     self.log = oeb.log
     find_images = XPath('//h:img[@src]')
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             continue
         for img in find_images(root):
             src = img.get('src', '')
             if not src.startswith('data:'):
                 continue
             header, sep, payload = src.partition(',')
             if not (header.startswith('data:image/') and payload):
                 continue
             if ';base64' not in header:
                 payload = urlunquote(payload)
             else:
                 # Whitespace is not significant in base64 data
                 payload = re.sub(r'\s+', '', payload)
                 from base64 import b64decode
                 try:
                     payload = b64decode(payload)
                 except Exception:
                     self.log.error('Found invalid base64 encoded data URI, ignoring it')
                     continue
             from imghdr import what
             fmt = what(None, payload)
             if not fmt:
                 self.log.warn('Image encoded as data URL has unknown format, ignoring')
                 continue
             new_href = self.convert_image_data_uri(payload, fmt, oeb)
             img.set('src', item.relhref(new_href))
Ejemplo n.º 5
0
    def create_image_markup(self, html_img, stylizer, href, as_block=False):
        '''
        Build the ``w:drawing`` element for an HTML image.

        :param html_img: The HTML <img> element being converted.
        :param stylizer: Object whose style() method returns the computed
                         style of an element.
        :param href:     Key into self.images identifying the image.
        :param as_block: If True, horizontal placement is derived from auto
                         margins and an anchored image is wrapped top and
                         bottom instead of square.
        :return: The new ``w:drawing`` element (not attached to a document).
        '''
        # TODO: img inside a link (clickable image)
        style = stylizer.style(html_img)
        floating = style['float']
        if floating not in {'left', 'right'}:
            floating = None
        if as_block:
            ml, mr = style._get('margin-left'), style._get('margin-right')
            # An auto margin absorbs the free space on its side, pushing the
            # image towards the opposite side; two auto margins center it.
            if ml == 'auto':
                floating = 'center' if mr == 'auto' else 'right'
            if mr == 'auto':
                floating = 'center' if ml == 'auto' else 'left'
        else:
            parent = html_img.getparent()
            if len(parent) == 1 and not (parent.text or '').strip() and not (html_img.tail or '').strip():
                # We have an inline image alone inside a block
                pstyle = stylizer.style(parent)
                if pstyle['text-align'] in ('center', 'right') and 'block' in pstyle['display']:
                    floating = pstyle['text-align']
        fake_margins = floating is None
        self.count += 1
        img = self.images[href]
        name = urlunquote(posixpath.basename(href))
        width, height = style.img_size(img.width, img.height)
        scaled, width, height = fit_image(width, height, self.page_width, self.page_height)
        width, height = map(pt_to_emu, (width, height))

        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces

        root = etree.Element('root', nsmap=namespaces)
        ans = makeelement(root, 'w:drawing', append=False)
        if floating is None:
            parent = makeelement(ans, 'wp:inline')
        else:
            parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
            # The next three lines are boilerplate that Word requires, even
            # though the DOCX specs define defaults for all of them
            parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
            parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
            makeelement(parent, 'wp:simplePos', x='0', y='0')
            makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
            makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
        makeelement(parent, 'wp:extent', cx=str(width), cy=str(height))
        if fake_margins:
            # DOCX does not support setting margins for inline images, so we
            # fake it by using effect extents to simulate margins
            makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in get_image_margins(style).iteritems()})
        else:
            makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
        if floating is not None:
            # The idiotic Word requires this to be after the extent settings
            if as_block:
                makeelement(parent, 'wp:wrapTopAndBottom')
            else:
                makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
        self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height)
        return ans
Ejemplo n.º 6
0
 def url_to_local_path(cls, url, base):
     '''
     Convert a parsed URL into a local filesystem path.

     :param url:  A parsed URL providing path, params and query.
     :param base: Directory that relative paths are resolved against.
     :return: The unquoted path, made absolute against base when it is
              relative.
     '''
     local = url.path
     # On windows a leading / is dropped and the rest is taken as absolute
     had_leading_slash = iswindows and local.startswith('/')
     if had_leading_slash:
         local = local[1:]
     local = urlunquote(urlunparse(('', '', local, url.params, url.query, '')))
     if had_leading_slash or os.path.isabs(local):
         return local
     return os.path.abspath(os.path.join(base, local))
Ejemplo n.º 7
0
 def url_to_local_path(cls, url, base):
     '''
     Convert a parsed URL into a local filesystem path.

     :param url:  A parsed URL providing path, params and query.
     :param base: Directory that relative paths are resolved against.
     :return: The unquoted path, made absolute against base when it is
              relative.
     '''
     local = url.path
     # On windows a leading / is dropped and the rest is taken as absolute
     had_leading_slash = iswindows and local.startswith('/')
     if had_leading_slash:
         local = local[1:]
     local = urlunquote(urlunparse(('', '', local, url.params, url.query, '')))
     if had_leading_slash or os.path.isabs(local):
         return local
     return os.path.abspath(os.path.join(base, local))
Ejemplo n.º 8
0
 def link_replacer(base, url):
     '''
     Rewrite a single link found in the file named base.

     Fragment-only links and relative links that href_to_name() resolves are
     rewritten via resource_template. Links with a netloc, a query, a
     non-file scheme, an empty path or a path starting with / are returned
     unchanged. base is added to changed (from the enclosing scope)
     whenever a link is rewritten.
     '''
     if url.startswith('#'):
         anchor = urlunquote(url[1:])
         if anchor:
             changed.add(base)
             return resource_template.format(encode_url(base, anchor))
         return url
     parts = urlparse(url)
     untouchable = (
         parts.netloc or parts.query or
         (parts.scheme and parts.scheme != 'file') or
         not parts.path or parts.path.startswith('/')
     )
     if untouchable:
         return url
     name = self.href_to_name(parts.path, base)
     if not name:
         # Unresolvable hrefs are returned as the bare path, without fragment
         return parts.path
     rewritten = resource_template.format(encode_url(name, urlunquote(parts.fragment)))
     changed.add(base)
     return rewritten
Ejemplo n.º 9
0
 def create_filename(self, href, fmt):
     '''
     Return a filename for the image at href, with extension fmt, whose
     stem has not been handed out before (compared case-insensitively).

     The stem is the ASCII form of the unquoted basename of href without
     its extension, truncated to 75 characters; ``image`` is used when
     nothing is left. The chosen stem is recorded in self.seen_filenames.
     '''
     stem = posixpath.splitext(ascii_filename(urlunquote(posixpath.basename(href))))[0]
     stem = stem[:75].rstrip('.') or 'image'
     candidate, counter = stem, 0
     while candidate.lower() in self.seen_filenames:
         counter += 1
         candidate = stem + str(counter)
     self.seen_filenames.add(candidate.lower())
     return candidate + os.extsep + fmt.lower()
Ejemplo n.º 10
0
    def __init__(self, pathtoepub, log, clone_data=None, tdir=None):
        '''
        Open an EPUB by extracting it into a directory and locating its OPF.

        :param pathtoepub: Path to the EPUB file.
        :param log:        Logger passed on to the base class.
        :param clone_data: If not None, the container is initialised from
                           this data instead of from pathtoepub.
        :param tdir:       Directory to extract into. A persistent temporary
                           directory is created when None.
        :raises InvalidEpub: If META-INF/container.xml is missing, does not
                             point to an OPF file, or points to one that
                             does not exist.
        '''
        if clone_data is not None:
            super(EpubContainer, self).__init__(None,
                                                None,
                                                log,
                                                clone_data=clone_data)
            for x in ('pathtoepub', 'obfuscated_fonts'):
                setattr(self, x, clone_data[x])
            return

        self.pathtoepub = pathtoepub
        if tdir is None:
            tdir = PersistentTemporaryDirectory('_epub_container')
        tdir = os.path.abspath(os.path.realpath(tdir))
        self.root = tdir
        with open(self.pathtoepub, 'rb') as stream:
            try:
                zf = ZipFile(stream)
                zf.extractall(tdir)
            except Exception:
                log.exception('EPUB appears to be invalid ZIP file, trying a'
                              ' more forgiving ZIP parser')
                from calibre.utils.localunzip import extractall
                stream.seek(0)
                # Extract into tdir, the directory everything below reads
                # from, rather than the extractor's default location
                extractall(stream, path=tdir)
        try:
            os.remove(join(tdir, 'mimetype'))
        except EnvironmentError:
            pass

        container_path = join(self.root, 'META-INF', 'container.xml')
        if not exists(container_path):
            raise InvalidEpub('No META-INF/container.xml in epub')
        with open(container_path, 'rb') as f:
            container = etree.fromstring(f.read())
        opf_files = container.xpath(
            (r'child::ocf:rootfiles/ocf:rootfile'
             '[@media-type="%s" and @full-path]' % guess_type('a.opf')),
            namespaces={'ocf': OCF_NS})
        if not opf_files:
            raise InvalidEpub(
                'META-INF/container.xml contains no link to OPF file')
        # full-path is a URL-quoted, /-separated path relative to the root
        opf_path = os.path.join(
            self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
        if not exists(opf_path):
            raise InvalidEpub('OPF file does not exist at location pointed to'
                              ' by META-INF/container.xml')

        super(EpubContainer, self).__init__(tdir, opf_path, log)

        self.obfuscated_fonts = {}
        if 'META-INF/encryption.xml' in self.name_path_map:
            self.process_encryption()
        self.parsed_cache['META-INF/container.xml'] = container
Ejemplo n.º 11
0
 def create_filename(self, href, fmt):
     '''
     Return a filename for the image at href, with extension fmt, whose
     stem has not been handed out before (compared case-insensitively).

     The stem is the ASCII form of the unquoted basename of href without
     its extension, truncated to 75 characters; ``image`` is used when
     nothing is left. The chosen stem is recorded in self.seen_filenames.
     '''
     stem = posixpath.splitext(ascii_filename(urlunquote(posixpath.basename(href))))[0]
     stem = stem[:75].rstrip('.') or 'image'
     candidate, counter = stem, 0
     while candidate.lower() in self.seen_filenames:
         counter += 1
         candidate = stem + str(counter)
     self.seen_filenames.add(candidate.lower())
     return candidate + os.extsep + fmt.lower()
Ejemplo n.º 12
0
    def create_image_markup(self, html_img, stylizer, href):
        '''
        Build the ``w:drawing`` element for an HTML image.

        :param html_img: The HTML <img> element being converted.
        :param stylizer: Object whose style() method returns the computed
                         style of an element.
        :param href:     Key into self.images identifying the image.
        :return: The new ``w:drawing`` element (not attached to a document).
        '''
        # TODO: img inside a link (clickable image)
        style = stylizer.style(html_img)
        floating = style['float']
        if floating not in {'left', 'right'}:
            floating = None
        fake_margins = floating is None
        self.count += 1
        img = self.images[href]
        name = urlunquote(posixpath.basename(href))
        width, height = map(pt_to_emu, style.img_size(img.width, img.height))

        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces

        root = etree.Element('root', nsmap=namespaces)
        ans = makeelement(root, 'w:drawing', append=False)
        if floating is None:
            parent = makeelement(ans, 'wp:inline')
        else:
            parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
            # The next three lines are boilerplate that Word requires, even
            # though the DOCX specs define defaults for all of them
            parent.set('simplePos', '0'), parent.set('relativeHeight', '1'), parent.set('behindDoc',"0"), parent.set('locked', "0")
            parent.set('layoutInCell', "1"), parent.set('allowOverlap', '1')
            makeelement(parent, 'wp:simplePos', x='0', y='0')
            makeelement(makeelement(parent, 'wp:positionH', relativeFrom='margin'), 'wp:align').text = floating
            makeelement(makeelement(parent, 'wp:positionV', relativeFrom='line'), 'wp:align').text = 'top'
        # cy must be the height so the extent agrees with the a:ext below
        makeelement(parent, 'wp:extent', cx=str(width), cy=str(height))
        if fake_margins:
            # DOCX does not support setting margins for inline images, so we
            # fake it by using effect extents to simulate margins
            makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in get_image_margins(style).iteritems()})
        else:
            makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
        if floating is not None:
            # The idiotic Word requires this to be after the extent settings
            makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
        makeelement(parent, 'wp:docPr', id=str(self.count), name=name, descr=html_img.get('alt') or name)
        makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1")
        g = makeelement(parent, 'a:graphic')
        gd = makeelement(g, 'a:graphicData', uri=namespaces['pic'])
        pic = makeelement(gd, 'pic:pic')
        nvPicPr = makeelement(pic, 'pic:nvPicPr')
        makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=html_img.get('alt') or name)
        makeelement(nvPicPr, 'pic:cNvPicPr')
        bf = makeelement(pic, 'pic:blipFill')
        makeelement(bf, 'a:blip', r_embed=img.rid)
        makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
        spPr = makeelement(pic, 'pic:spPr')
        xfrm = makeelement(spPr, 'a:xfrm')
        makeelement(xfrm, 'a:off', x='0', y='0'), makeelement(xfrm, 'a:ext', cx=str(width), cy=str(height))
        makeelement(makeelement(spPr, 'a:prstGeom', prst='rect'), 'a:avLst')
        return ans
Ejemplo n.º 13
0
 def link_replacer(base, url):
     '''
     Rewrite a single link found in the file named base.

     Fragment-only links and relative links that href_to_name() resolves are
     rewritten via resource_template. Links with a netloc, a query, a
     non-file scheme, an empty path or a path starting with / are returned
     unchanged. base is added to changed (from the enclosing scope)
     whenever a link is rewritten.
     '''
     if url.startswith('#'):
         anchor = urlunquote(url[1:])
         if anchor:
             changed.add(base)
             return resource_template.format(encode_url(base, anchor))
         return url
     parts = urlparse(url)
     untouchable = (
         parts.netloc or parts.query or
         (parts.scheme and parts.scheme != 'file') or
         not parts.path or parts.path.startswith('/')
     )
     if untouchable:
         return url
     name = self.href_to_name(parts.path, base)
     if not name:
         # Unresolvable hrefs are returned as the bare path, without fragment
         return parts.path
     rewritten = resource_template.format(encode_url(name, urlunquote(parts.fragment)))
     changed.add(base)
     return rewritten
Ejemplo n.º 14
0
    def __init__(self, pathtoepub, log, clone_data=None, tdir=None):
        '''
        Open an EPUB by extracting it into a directory and locating its OPF.

        :param pathtoepub: Path to the EPUB file.
        :param log:        Logger passed on to the base class.
        :param clone_data: If not None, the container is initialised from
                           this data instead of from pathtoepub.
        :param tdir:       Directory to extract into. A persistent temporary
                           directory is created when None.
        :raises InvalidEpub: If META-INF/container.xml is missing, does not
                             point to an OPF file, or points to one that
                             does not exist.
        '''
        if clone_data is not None:
            super(EpubContainer, self).__init__(None, None, log, clone_data=clone_data)
            for x in ('pathtoepub', 'obfuscated_fonts'):
                setattr(self, x, clone_data[x])
            return

        self.pathtoepub = pathtoepub
        if tdir is None:
            tdir = PersistentTemporaryDirectory('_epub_container')
        tdir = os.path.abspath(os.path.realpath(tdir))
        self.root = tdir
        with open(self.pathtoepub, 'rb') as stream:
            try:
                zf = ZipFile(stream)
                zf.extractall(tdir)
            except Exception:
                log.exception('EPUB appears to be invalid ZIP file, trying a'
                        ' more forgiving ZIP parser')
                from calibre.utils.localunzip import extractall
                stream.seek(0)
                # Extract into tdir, the directory everything below reads
                # from, rather than the extractor's default location
                extractall(stream, path=tdir)
        try:
            os.remove(join(tdir, 'mimetype'))
        except EnvironmentError:
            pass

        container_path = join(self.root, 'META-INF', 'container.xml')
        if not exists(container_path):
            raise InvalidEpub('No META-INF/container.xml in epub')
        with open(container_path, 'rb') as f:
            container = etree.fromstring(f.read())
        opf_files = container.xpath((
            r'child::ocf:rootfiles/ocf:rootfile'
            '[@media-type="%s" and @full-path]'%guess_type('a.opf')
            ), namespaces={'ocf':OCF_NS}
        )
        if not opf_files:
            raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
        # full-path is a URL-quoted, /-separated path relative to the root
        opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
        if not exists(opf_path):
            raise InvalidEpub('OPF file does not exist at location pointed to'
                    ' by META-INF/container.xml')

        super(EpubContainer, self).__init__(tdir, opf_path, log)

        self.obfuscated_fonts = {}
        if 'META-INF/encryption.xml' in self.name_path_map:
            self.process_encryption()
        self.parsed_cache['META-INF/container.xml'] = container
Ejemplo n.º 15
0
 def href_to_name(self, href, base=None):
     '''
     Resolve href, taken relative to base, into a name.

     base is either a name or None; with None, self.root is used as the
     base directory. None is returned for hrefs that have a scheme, an
     empty path, or a path starting with /.
     '''
     if base is None:
         base_dir = self.root
     else:
         base_dir = os.path.dirname(self.name_to_abspath(base))
     parts = urlparse(href)
     rel = parts.path
     if parts.scheme or not rel or rel.startswith('/'):
         return None
     segments = urlunquote(rel).split('/')
     return self.abspath_to_name(os.path.join(base_dir, *segments))
Ejemplo n.º 16
0
 def __init__(self, url, base):
     '''
     :param url:  The url this link points to. Must be an unquoted unicode string.
     :param base: The base directory that relative URLs are with respect to.
                  Must be a unicode string.
     '''
     assert isinstance(url, unicode) and isinstance(base, unicode)
     self.url = url
     parsed = urlparse(self.url)
     local = parsed.scheme in ('', 'file')
     # A local URL without a path points inside the current document
     internal = local and not bool(parsed.path)
     self.parsed_url = parsed
     self.is_local = local
     self.is_internal = internal
     self.path = None
     self.fragment = urlunquote(parsed.fragment)
     if local and not internal:
         self.path = self.url_to_local_path(parsed, base)
Ejemplo n.º 17
0
 def href_to_name(self, href, base=None):
     '''
     Resolve href, taken relative to base, into a name.

     base is either a name or None; with None, self.root is used as the
     base directory. None is returned for hrefs that have a scheme, an
     empty path, or a path starting with /.
     '''
     if base is None:
         base_dir = self.root
     else:
         base_dir = os.path.dirname(self.name_to_abspath(base))
     parts = urlparse(href)
     rel = parts.path
     if parts.scheme or not rel or rel.startswith('/'):
         return None
     segments = urlunquote(rel).split('/')
     return self.abspath_to_name(os.path.join(base_dir, *segments))
Ejemplo n.º 18
0
 def __init__(self, url, base):
     '''
     :param url:  The url this link points to. Must be an unquoted unicode string.
     :param base: The base directory that relative URLs are with respect to.
                  Must be a unicode string.
     '''
     assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
     self.url = url
     parsed = urlparse(self.url)
     local = parsed.scheme in ('', 'file')
     # A local URL without a path points inside the current document
     internal = local and not bool(parsed.path)
     self.parsed_url = parsed
     self.is_local = local
     self.is_internal = internal
     self.path = None
     self.fragment = urlunquote(parsed.fragment)
     if local and not internal:
         self.path = self.url_to_local_path(parsed, base)
Ejemplo n.º 19
0
    def __init__(self, pathtoepub, log, clone_data=None, tdir=None):
        """
        Open an EPUB by extracting it into a directory and locating its OPF.

        :param pathtoepub: Path to the EPUB file.
        :param log:        Logger passed on to the base class.
        :param clone_data: If not None, the container is initialised from
                           this data instead of from pathtoepub.
        :param tdir:       Directory to extract into. A persistent temporary
                           directory is created when None.
        :raises InvalidEpub: If META-INF/container.xml is missing, does not
                             point to an OPF file, or points to one that
                             does not exist.
        """
        if clone_data is not None:
            super(EpubContainer, self).__init__(None, None, log, clone_data=clone_data)
            for x in ("pathtoepub", "obfuscated_fonts"):
                setattr(self, x, clone_data[x])
            return

        self.pathtoepub = pathtoepub
        if tdir is None:
            tdir = PersistentTemporaryDirectory("_epub_container")
        tdir = os.path.abspath(os.path.realpath(tdir))
        self.root = tdir
        with open(self.pathtoepub, "rb") as stream:
            try:
                zf = ZipFile(stream)
                zf.extractall(tdir)
            except Exception:
                log.exception("EPUB appears to be invalid ZIP file, trying a" " more forgiving ZIP parser")
                from calibre.utils.localunzip import extractall

                stream.seek(0)
                # Extract into tdir, the directory everything below reads
                # from, rather than the extractor's default location
                extractall(stream, path=tdir)
        try:
            os.remove(join(tdir, "mimetype"))
        except EnvironmentError:
            pass

        container_path = join(self.root, "META-INF", "container.xml")
        if not exists(container_path):
            raise InvalidEpub("No META-INF/container.xml in epub")
        with open(container_path, "rb") as f:
            container = etree.fromstring(f.read())
        opf_files = container.xpath(
            (r"child::ocf:rootfiles/ocf:rootfile" '[@media-type="%s" and @full-path]' % guess_type("a.opf")),
            namespaces={"ocf": OCF_NS},
        )
        if not opf_files:
            raise InvalidEpub("META-INF/container.xml contains no link to OPF file")
        # full-path is a URL-quoted, /-separated path relative to the root
        opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get("full-path")).split("/")))
        if not exists(opf_path):
            raise InvalidEpub("OPF file does not exist at location pointed to" " by META-INF/container.xml")

        super(EpubContainer, self).__init__(tdir, opf_path, log)

        self.obfuscated_fonts = {}
        if "META-INF/encryption.xml" in self.name_path_map:
            self.process_encryption()
        self.parsed_cache["META-INF/container.xml"] = container
Ejemplo n.º 20
0
 def verify_links(self):
     '''
     Record, in each spine item's verified_links, every link of that item
     that has no scheme or netloc, resolves to an item in the spine and
     either has no fragment or has one present in the target's anchor_map.
     '''
     known = {s:s for s in self.spine}
     for item in self.spine:
         folder = os.path.dirname(item)
         for link in item.all_links:
             try:
                 parsed = urlparse(urlunquote(link))
             except Exception:
                 continue
             if parsed.scheme or parsed.netloc:
                 continue
             if parsed.path:
                 target = os.path.abspath(os.path.join(folder, parsed.path))
             else:
                 # A bare fragment refers to the item itself
                 target = item
             try:
                 # Swap the path for the spine object it is equal to
                 target = known[target]
             except Exception:
                 continue
             anchor = parsed.fragment
             if not anchor or anchor in target.anchor_map:
                 item.verified_links.add((target, anchor))
Ejemplo n.º 21
0
    def rewrite_links(self, url):
        '''
        Return the replacement for url according to self.map, as an href
        relative to self.current_item. URLs that cannot be parsed, or whose
        target is not in self.map, are returned unchanged.
        '''
        href, frag = urldefrag(url)
        try:
            href = self.current_item.abshref(href)
        except ValueError:
            # Unparseable URL
            return url
        href = urlnormalize(href)
        if href not in self.map:
            return url
        # The None key holds the entry used for links without a fragment
        target = self.map[href][frag or None]
        target = self.current_item.relhref(target)
        if frag:
            target = '#'.join((urlunquote(target), frag))
        return target
Ejemplo n.º 22
0
 def verify_links(self):
     '''
     Record, in each spine item's verified_links, every link of that item
     that has no scheme or netloc, resolves to an item in the spine and
     either has no fragment or has one present in the target's anchor_map.
     '''
     known = {s:s for s in self.spine}
     for item in self.spine:
         folder = os.path.dirname(item)
         for link in item.all_links:
             try:
                 parsed = urlparse(urlunquote(link))
             except Exception:
                 continue
             if parsed.scheme or parsed.netloc:
                 continue
             if parsed.path:
                 target = os.path.abspath(os.path.join(folder, parsed.path))
             else:
                 # A bare fragment refers to the item itself
                 target = item
             try:
                 # Swap the path for the spine object it is equal to
                 target = known[target]
             except Exception:
                 continue
             anchor = parsed.fragment
             if not anchor or anchor in target.anchor_map:
                 item.verified_links.add((target, anchor))
Ejemplo n.º 23
0
    def rewrite_links(self, url):
        '''
        Return the replacement for url according to self.map, as an href
        relative to self.current_item. URLs that cannot be parsed, or whose
        target is not in self.map, are returned unchanged.
        '''
        href, frag = urldefrag(url)
        try:
            href = self.current_item.abshref(href)
        except ValueError:
            # Unparseable URL
            return url
        href = urlnormalize(href)
        if href not in self.map:
            return url
        # The None key holds the entry used for links without a fragment
        target = self.map[href][frag or None]
        target = self.current_item.relhref(target)
        if frag:
            target = '#'.join((urlunquote(target), frag))
        return target
Ejemplo n.º 24
0
 def read_font_fule(self, basedir, css):
     '''
     Load the font files referenced by the rules in css into QFontDatabase.

     src values are resolved against basedir and each file is processed
     only once (tracked in self.added_fonts). When the family name
     reported for a loaded font differs from the font-family declared in
     the rule, the pair is stored, UTF-8 encoded, in self.replace_map.
     CSS that cannot be parsed and rules lacking src or font-family are
     skipped silently.
     '''
     from PyQt4.Qt import QFontDatabase
     import cssutils
     cssutils.log.setLevel(logging.ERROR)
     try:
         sheet = cssutils.parseString(css, validate=False)
     except:
         return
     for rule in sheet.cssRules:
         try:
             style = rule.style
             src = style.getProperty('src').propertyValue[0].uri
             declared = style.getProperty('font-family').propertyValue[0].value
         except:
             continue
         if not (src and declared):
             continue
         font_file = os.path.normcase(os.path.abspath(os.path.join(basedir,
             src)))
         if font_file in self.added_fonts:
             continue
         self.added_fonts.add(font_file)
         if not os.path.exists(font_file):
             # The src may have been URL-quoted; retry with it unquoted
             from calibre.ebooks.oeb.base import urlunquote
             unquoted = urlunquote(font_file, error_handling='replace')
             if os.path.exists(unquoted):
                 font_file = unquoted
         if not os.path.exists(font_file):
             continue
         with open(font_file, 'rb') as f:
             idx = QFontDatabase.addApplicationFontFromData(f.read())
         if idx <= -1:
             continue
         family = map(unicode,
             QFontDatabase.applicationFontFamilies(idx)).next()
         self.log('Extracted embedded font:', family, 'from',
                 os.path.basename(font_file))
         differs = family and family != declared
         if differs and family not in self.replace_map:
             self.log('Replacing font family value:',
                     declared, 'with', family)
             self.replace_map[declared.encode('utf-8')] = \
                     family.encode('utf-8')
Ejemplo n.º 25
0
def check_links(container):
    '''
    Check the links between the files of ``container``.

    Every link found in the HTML, CSS, OPF and NCX files is resolved to a
    file name. Problems with individual links (empty, dangling, wrong case,
    pointing at a directory or outside the book, ``file://`` URLs, colons in
    the path) are reported. Afterwards files that are never referenced from
    the spine, files missing from the manifest and the calibre bookmarks
    file are reported as well.

    :param container: The book container to check.
    :return: A list of error objects, in the order they were detected.
    '''
    errors = []
    report = errors.append
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}

    def fl(x):
        # repr() of the link, without the leading u of unicode literals
        quoted = repr(x)
        return quoted[1:] if quoted.startswith('u') else quoted

    for name, mt in iteritems(container.mime_map):
        if not (mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types):
            continue
        for href, lnum, col in container.iterlinks(name):
            if not href:
                report(EmptyLink(_('The link is empty'), name, lnum, col))
            try:
                tname = container.href_to_name(href, name)
            except ValueError:
                tname = None  # Absolute paths to files on another drive in windows cause this

            if tname is None:
                # The link does not map to a name inside the book
                purl = urlparse(href)
                if purl.scheme == 'file':
                    report(FileLink(_('The link %s is a file:// URL') % fl(href), name, lnum, col))
                elif purl.path and purl.scheme in {'', 'file'}:
                    if purl.path.startswith('/'):
                        report(LocalLink(_('The link %s points to a file outside the book') % fl(href), name, lnum, col))
                    elif ':' in urlunquote(purl.path):
                        report(InvalidCharInLink(_('The link %s contains a : character, this will cause errors on Windows computers') % fl(href), name, lnum, col))
                continue

            if not container.exists(tname):
                cname = corrected_case_for_name(container, tname)
                if cname is None:
                    report(DanglingLink(_('The linked resource %s does not exist') % fl(href), tname, name, lnum, col))
                else:
                    report(CaseMismatch(href, cname, name, lnum, col))
            elif tname in container.mime_map:
                links_map[name].add(tname)
            elif os.path.isdir(container.name_to_abspath(tname)):
                # Filesystem says the file exists, but it is not in the
                # mime_map, so either there is a case mismatch or the link
                # is a directory
                report(BadLink(_('The linked resource %s is a directory') % fl(href), name, lnum, col))
            else:
                report(CaseMismatch(href, actual_case_for_name(container, tname), name, lnum, col))

    def linked_styles(names):
        # Stylesheets directly linked from any of the given files
        return {tname for name in names for tname in links_map[name] if container.mime_map.get(tname, None) in OEB_STYLES}

    spine_docs = {name for name, linear in container.spine_names}
    spine_styles = linked_styles(spine_docs)
    previous_count = -1
    while len(spine_styles) > previous_count:
        # Handle import rules in stylesheets: keep following links until no
        # new stylesheet is discovered
        previous_count = len(spine_styles)
        spine_styles |= linked_styles(spine_styles)
    seen = set(OEB_DOCS) | set(OEB_STYLES)
    spine_resources = {tname for name in spine_docs | spine_styles for tname in links_map[name] if container.mime_map[tname] not in seen}
    unreferenced = set()

    cover_name = container.guide_type_map.get('cover', None)
    nav_items = frozenset(container.manifest_items_with_property('nav'))

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES and name not in spine_styles:
            problem = UnreferencedResource(name)
        elif mt in OEB_DOCS and name not in spine_docs and name not in nav_items:
            problem = UnreferencedDoc(name)
        elif (mt in OEB_FONTS or mt.partition('/')[0] in {'image', 'audio', 'video'}) and name not in spine_resources and name != cover_name:
            if mt.partition('/')[0] == 'image' and name == get_raster_cover_name(container):
                continue
            problem = UnreferencedResource(name)
        else:
            continue
        report(problem)
        unreferenced.add(name)

    manifest_names = set(itervalues(container.manifest_id_map))
    for name in container.mime_map:
        if not (name in manifest_names or container.ok_to_be_unmanifested(name)):
            report(Unmanifested(name, unreferenced=name in unreferenced))
        if name == 'META-INF/calibre_bookmarks.txt':
            report(Bookmarks(name))

    return errors
Ejemplo n.º 26
0
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        '''
        Build an NCX table of contents from an EPUB 3 navigation document.

        The ``toc`` nav of the document at ``nav_path`` is converted into an
        NCX file written next to it, the NCX is added to the manifest of
        ``opf`` and set as the ``toc`` of every spine element. The location
        and parsed tree of the navigation document are stored on ``opts``.
        If ``self.removed_cover`` is set, links in the navigation document
        that point to it are marked and the document is rewritten.

        Returns without doing anything when no usable ``toc`` nav is found.
        '''
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile

        with lopen(nav_path, 'rb') as src:
            raw = src.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        type_attr = '{%s}type' % EPUB_NS
        nav_basename = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Create a navPoint under parent from the first <a>/<span> of li
            label = target = None
            first = next(li.iterchildren(XHTML('a'), XHTML('span')), None)
            if first is not None:
                label = etree.tostring(first, method='text', encoding=unicode, with_tail=False).strip()
                if not label:
                    # No text content, fall back to the title attributes
                    label = ' '.join(first.xpath('descendant-or-self::*/@title')).strip()
                target = first.get('href')
                if target and target.startswith('#'):
                    # Fragment only links refer to the nav document itself
                    target = nav_basename + target
            point = parent.makeelement(NCX('navPoint'))
            parent.append(point)
            nav_label = point.makeelement(NCX('navLabel'))
            point.append(nav_label)
            text_elem = point.makeelement(NCX('text'))
            nav_label.append(text_elem)
            text_elem.text = label
            if target:
                point.append(point.makeelement(NCX('content'), attrib={'src':target}))
            return point

        def process_nav_node(node, toc_parent):
            # Recursively mirror the nested <ol>/<li> structure
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                nested = first_child(li, XHTML('ol'))
                if child is not None and nested is not None:
                    process_nav_node(nested, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(type_attr) != 'toc':
                continue
            top_list = first_child(nav, XHTML('ol'))
            if top_list is None:
                continue
            process_nav_node(top_list, navmap)
            break
        else:
            # No toc nav with a list was found
            return

        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as tmp:
            tmp.write(etree.tostring(ncx, encoding='utf-8'))
            ncx_path = tmp.name
        ncx_href = os.path.relpath(ncx_path, os.getcwdu()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root

        if not getattr(self, 'removed_cover', None):
            return
        base_path = os.path.dirname(nav_path)
        changed = False
        for elem in root.xpath('//*[@href]'):
            # Compare the link target, without its fragment, to the cover
            href = elem.get('href').partition('#')[0]
            target_path = os.path.join(base_path, urlunquote(href))
            abs_href = urlnormalize(os.path.relpath(target_path, base_path))
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as out:
                out.write(serialize(root, 'application/xhtml+xml'))
Ejemplo n.º 27
0
    def create_image_markup(self, html_img, stylizer, href, as_block=False):
        '''
        Build the ``w:drawing`` element that places an image in the document.

        The image is emitted inline (``wp:inline``) unless it floats, in
        which case it is anchored (``wp:anchor``) and text wraps around it.
        ``self.count`` is incremented and used as the drawing id.

        :param html_img: The source ``<img>`` element; its ``alt`` attribute
            is used as the image description.
        :param stylizer: Object whose ``style()`` method returns the computed
            style of ``html_img``.
        :param href: Key into ``self.images`` identifying the image.
        :param as_block: If True, ``auto`` left/right margins are translated
            into an alignment, and an anchored image is wrapped top and
            bottom instead of square.
        :return: The newly created ``w:drawing`` element.
        '''
        # TODO: img inside a link (clickable image)
        style = stylizer.style(html_img)
        floating = style['float']
        if floating not in {'left', 'right'}:
            floating = None
        if as_block:
            ml, mr = style._get('margin-left'), style._get('margin-right')
            if ml == 'auto':
                floating = 'center' if mr == 'auto' else 'right'
            # NOTE(review): margin-right:auto with a fixed left margin maps
            # to 'right' here, although in CSS that combination left-aligns
            # the block. Kept as is, confirm the intended behavior.
            if mr == 'auto':
                floating = 'center' if ml == 'auto' else 'right'
        fake_margins = floating is None
        self.count += 1
        img = self.images[href]
        name = urlunquote(posixpath.basename(href))
        width, height = map(pt_to_emu, style.img_size(img.width, img.height))

        makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces

        root = etree.Element('root', nsmap=namespaces)
        ans = makeelement(root, 'w:drawing', append=False)
        if floating is None:
            parent = makeelement(ans, 'wp:inline')
        else:
            parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
            # The next lines are boilerplate that Word requires, even
            # though the DOCX specs define defaults for all of them
            parent.set('simplePos', '0')
            parent.set('relativeHeight', '1')
            parent.set('behindDoc', "0")
            parent.set('locked', "0")
            parent.set('layoutInCell', "1")
            parent.set('allowOverlap', '1')
            makeelement(parent, 'wp:simplePos', x='0', y='0')
            makeelement(
                makeelement(parent, 'wp:positionH', relativeFrom='margin'),
                'wp:align').text = floating
            makeelement(
                makeelement(parent, 'wp:positionV', relativeFrom='line'),
                'wp:align').text = 'top'
        # cy must be the height; it used to be written as the width, which
        # gave every drawing a square extent that disagreed with a:ext below
        makeelement(parent, 'wp:extent', cx=str(width), cy=str(height))
        if fake_margins:
            # DOCX does not support setting margins for inline images, so we
            # fake it by using effect extents to simulate margins. The
            # attribute name is the lower-cased last letter of each margin key.
            makeelement(
                parent, 'wp:effectExtent', **{
                    k[-1].lower(): v
                    for k, v in get_image_margins(style).items()
                })
        else:
            makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
        if floating is not None:
            # The idiotic Word requires this to be after the extent settings
            if as_block:
                makeelement(parent, 'wp:wrapTopAndBottom')
            else:
                makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
        makeelement(parent,
                    'wp:docPr',
                    id=str(self.count),
                    name=name,
                    descr=html_img.get('alt') or name)
        makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'),
                    'a:graphicFrameLocks',
                    noChangeAspect="1")
        g = makeelement(parent, 'a:graphic')
        gd = makeelement(g, 'a:graphicData', uri=namespaces['pic'])
        pic = makeelement(gd, 'pic:pic')
        nvPicPr = makeelement(pic, 'pic:nvPicPr')
        makeelement(nvPicPr,
                    'pic:cNvPr',
                    id='0',
                    name=name,
                    descr=html_img.get('alt') or name)
        makeelement(nvPicPr, 'pic:cNvPicPr')
        bf = makeelement(pic, 'pic:blipFill')
        makeelement(bf, 'a:blip', r_embed=img.rid)
        makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
        spPr = makeelement(pic, 'pic:spPr')
        xfrm = makeelement(spPr, 'a:xfrm')
        makeelement(xfrm, 'a:off', x='0', y='0')
        makeelement(xfrm, 'a:ext', cx=str(width), cy=str(height))
        makeelement(makeelement(spPr, 'a:prstGeom', prst='rect'), 'a:avLst')
        return ans
Ejemplo n.º 28
0
    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        The TOC, the parsed tree of every spine item and the main stylesheet
        of ``self.oeb`` are modified in place. Nothing is returned.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                    node.href = base

        # These do not depend on the spine item, so build them only once
        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                './h:label|./h:fieldset|./h:legend')
        in_table = XPath('ancestor::h:table')
        special_chars = re.compile(u'[\u200b\u00ad]')

        for item in self.oeb.spine:
            root = item.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for img in XPath('//h:img')(body):
                    src = img.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for anchor in XPath('//h:a[@name]')(body):
                    if not anchor.get('id', False):
                        anchor.set('id', anchor.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        # next() works on Python 2.6+ and Python 3; the
                        # .next() method exists only on Python 2 iterators
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No usable preceding sibling, the <br> is treated
                        # as following the body text directly
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = u'\u00a0'
                    # A real list is needed here, since it is appended to
                    style = [s for s in (part.strip() for part in
                        br.get('style', '').split(';')) if s]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace(u'\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace(u'\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.'+lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
Ejemplo n.º 29
0
    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        The TOC, the parsed tree of every spine item and the main stylesheet
        of ``self.oeb`` are modified in place. Nothing is returned.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            # Only string-like hrefs can be split into base and fragment
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'
                        % frag)
                    # Drop the fragment, keep the link to the file itself
                    node.href = base

        for x in self.oeb.spine:
            root = x.data
            # XPath returns a list; body becomes the first match, or stays
            # an empty list when the document has no <body>
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            # True only when a <body> element was actually found above
            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                # NOTE: x is reused as a loop variable from here on; root
                # was already taken from the spine item above
                for x in XPath('//h:img')(body):
                    src = x.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(x)
                # Removal is done in a second pass, after the search finished
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for x in XPath('//h:a[@name]')(body):
                    if not x.get('id', False):
                        x.set('id', x.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    x.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except:
                        # No usable preceding sibling: treat the <br> as
                        # directly following the text of <body>
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    # The <br> becomes a <p> containing a non-breaking space
                    br.tag = XHTML('p')
                    br.text = '\u00a0'
                    # Keep the non-empty declarations of any existing style
                    style = br.get('style', '').split(';')
                    style = [_f for _f in [x.strip() for x in style] if _f]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            # <embed> is removed everywhere, <object> unless it is an SVG
            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {
                        'image/svg+xml', 'application/svg+xml'
                }:
                    continue
                tag.getparent().remove(tag)

            # Remove <title> and <style> tags that have no text
            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            # Remove scripts with neither text nor src, except MathJax
            # configuration blocks
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False)
                        and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            # Scripts inside <body> are removed unconditionally
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            # A <form> with form controls as direct children is removed,
            # any other <form> is turned into a <div>
            formchildren = XPath('./h:input|./h:button|./h:textarea|'
                                 './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            # <center> is replaced by a <div> with centered text
            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = XPath('ancestor::h:table')
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            # U+200B and U+00AD are deleted, U+2011 becomes a plain hyphen
            special_chars = re.compile('[\u200b\u00ad]')
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    # Only rules whose whole selector is exactly
                    # .<class attribute value> are matched
                    sel = '.' + lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(
                            CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
Ejemplo n.º 30
0
    def workaround_ade_quirks(self):  # {{{
        """
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        The TOC, the parsed tree of every spine item and the main stylesheet
        of ``self.oeb`` are modified in place. Nothing is returned.
        """
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r"[-A-Za-z0-9_:.]+$")
        for node in self.oeb.toc.iter():
            href = getattr(node, "href", None)
            if hasattr(href, "partition"):
                base, _, frag = href.partition("#")
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn("Removing invalid fragment identifier %r from TOC" % frag)
                    node.href = base

        # These do not depend on the spine item, so build them only once
        formchildren = XPath("./h:input|./h:button|./h:textarea|./h:label|./h:fieldset|./h:legend")
        in_table = XPath("ancestor::h:table")
        special_chars = re.compile(u"[\u200b\u00ad]")

        for item in self.oeb.spine:
            root = item.data
            body = XPath("//h:body")(root)
            if body:
                body = body[0]

            if hasattr(body, "xpath"):
                # remove <img> tags with empty src elements
                bad = []
                for img in XPath("//h:img")(body):
                    src = img.get("src", "").strip()
                    if src in ("", "#") or src.startswith("http:"):
                        bad.append(img)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for anchor in XPath("//h:a[@name]")(body):
                    if not anchor.get("id", False):
                        anchor.set("id", anchor.get("name"))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop("name")

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath("./h:br")(body):
                    if br.getparent() is None:
                        continue
                    try:
                        # next() works on Python 2.6+ and Python 3; the
                        # .next() method exists only on Python 2 iterators
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No usable preceding sibling, the <br> is treated
                        # as following the body text directly
                        priortag = "body"
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML("p")
                    br.text = u"\u00a0"
                    # A real list is needed here, since it is appended to
                    style = [s for s in (part.strip() for part in br.get("style", "").split(";")) if s]
                    style.append("margin:0pt; border:0pt")
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append("height:1em")
                    else:
                        style.append("height:0pt")
                    br.set("style", "; ".join(style))

            for tag in XPath("//h:embed")(root):
                tag.getparent().remove(tag)
            for tag in XPath("//h:object")(root):
                if tag.get("type", "").lower().strip() in {"image/svg+xml", "application/svg+xml"}:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath("//h:title|//h:style")(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath("//h:script")(root):
                if not tag.text and not tag.get("src", False) and tag.get("type", None) != "text/x-mathjax-config":
                    tag.getparent().remove(tag)
            for tag in XPath("//h:body/descendant::h:script")(root):
                tag.getparent().remove(tag)

            for tag in XPath("//h:form")(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML("div")

            for tag in XPath("//h:center")(root):
                tag.tag = XHTML("div")
                tag.set("style", "text-align:center")
            # ADE can't handle &amp; in an img url
            for tag in XPath("//h:img[@src]")(root):
                tag.set("src", tag.get("src", "").replace("&", ""))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            for tag in XPath("//h:td|//h:tr|//h:th")(root):
                if not in_table(tag):
                    tag.tag = XHTML("div")

            # U+200B and U+00AD are deleted, U+2011 becomes a plain hyphen.
            # getattr is used because every descendant node is visited,
            # not only elements.
            for elem in root.iterdescendants():
                if getattr(elem, "text", False):
                    elem.text = special_chars.sub("", elem.text)
                    elem.text = elem.text.replace(u"\u2011", "-")
                if getattr(elem, "tail", False):
                    elem.tail = special_chars.sub("", elem.tail)
                    elem.tail = elem.tail.replace(u"\u2011", "-")

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule

                for lb in XPath("//h:ul[@class]|//h:ol[@class]")(root):
                    sel = "." + lb.get("class")
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty("margin-left")
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty("padding-left")
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue("white-space")
                    if ws == "pre":
                        style.setProperty("white-space", "pre-wrap")
Ejemplo n.º 31
0
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        '''
        Generate an NCX file from the ``toc`` nav of an EPUB 3 navigation
        document and register it in ``opf``.

        The NCX is written to a new file in the directory of ``nav_path``,
        added to the manifest and referenced from every spine element.
        ``opts.epub3_nav_href`` and ``opts.epub3_nav_parsed`` are set. When
        ``self.removed_cover`` is set, links to it in the navigation document
        get a ``data-calibre-removed-titlepage`` attribute and the document
        is saved again.

        Nothing happens when the document has no ``toc`` nav with a list.
        '''
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile

        with lopen(nav_path, 'rb') as nav_file:
            raw = nav_file.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring(
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>'
        )
        navmap = ncx[0]
        epub_type = '{%s}type' % EPUB_NS
        nav_name = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Only the first <a> or <span> child of the list item is used
            text = href = None
            source = next(li.iterchildren(XHTML('a'), XHTML('span')), None)
            if source is not None:
                text = etree.tostring(
                    source, method='text', encoding=unicode,
                    with_tail=False).strip()
                if not text:
                    # Fall back to the title attributes when there is no text
                    text = ' '.join(
                        source.xpath('descendant-or-self::*/@title')).strip()
                href = source.get('href')
                if href and href.startswith('#'):
                    # A bare fragment points into the nav document itself
                    href = nav_name + href
            nav_point = parent.makeelement(NCX('navPoint'))
            parent.append(nav_point)
            label = nav_point.makeelement(NCX('navLabel'))
            nav_point.append(label)
            label_text = nav_point.makeelement(NCX('text'))
            label.append(label_text)
            label_text.text = text
            if href:
                content = nav_point.makeelement(NCX('content'),
                                                attrib={'src': href})
                nav_point.append(content)
            return nav_point

        def process_nav_node(node, toc_parent):
            # Walk the list items, descending into nested ordered lists
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                sub_list = first_child(li, XHTML('ol'))
                if child is not None and sub_list is not None:
                    process_nav_node(sub_list, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(epub_type) != 'toc':
                continue
            toc_list = first_child(nav, XHTML('ol'))
            if toc_list is None:
                continue
            process_nav_node(toc_list, navmap)
            break
        else:
            # Nothing to convert
            return

        ncx_file = NamedTemporaryFile(suffix='.ncx',
                                      dir=os.path.dirname(nav_path),
                                      delete=False)
        with ncx_file:
            ncx_file.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(ncx_file.name,
                                   os.getcwdu()).replace(os.sep, '/')
        manifest_item = opf.create_manifest_item(ncx_href, NCX_MIME,
                                                 append=True)
        ncx_id = manifest_item.get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        nav_href = os.path.relpath(nav_path).replace(os.sep, '/')
        opts.epub3_nav_href = urlnormalize(nav_href)
        opts.epub3_nav_parsed = root

        if not getattr(self, 'removed_cover', None):
            return
        base_path = os.path.dirname(nav_path)
        changed = False
        for elem in root.xpath('//*[@href]'):
            # The fragment, if any, plays no part in the comparison
            href = elem.get('href').partition('#')[0]
            link_path = os.path.relpath(
                os.path.join(base_path, urlunquote(href)), base_path)
            if urlnormalize(link_path) == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as out:
                out.write(serialize(root, 'application/xhtml+xml'))
Ejemplo n.º 32
0
def check_links(container):
    """Return a list of link-related problems found in *container*.

    The check runs in three passes:

    1. Every link yielded by ``container.iterlinks`` for files whose media
       type is in ``OEB_DOCS``, ``OEB_STYLES`` or matches the guessed OPF/NCX
       types is resolved to a book-internal name.  Empty links, links to
       directories, case mismatches, dangling links, ``file://`` URLs,
       absolute local paths and paths containing ``:`` are reported.
    2. Starting from the spine documents, the set of stylesheets and other
       resources reachable through the collected links is computed, and
       files outside that set are reported as unreferenced.
    3. Files that are absent from the manifest (and not allowed to be) and
       the calibre bookmarks file are reported.

    The returned list contains the error objects in the order they were
    detected.
    """
    errors = []
    report = errors.append
    links_map = defaultdict(set)
    xml_types = {guess_type('a.opf'), guess_type('a.ncx')}
    local_schemes = {'', 'file'}

    def fl(x):
        # repr() for display, without the leading ``u`` of unicode reprs
        text = repr(x)
        return text[1:] if text.startswith('u') else text

    def report_unresolved(href, name, lnum, col):
        # Classify a link that could not be mapped to a name inside the book
        purl = urlparse(href)
        if purl.scheme == 'file':
            report(
                FileLink(
                    _('The link %s is a file:// URL') % fl(href), name, lnum,
                    col))
            return
        if not purl.path:
            return
        if purl.path.startswith('/') and purl.scheme in local_schemes:
            report(
                LocalLink(
                    _('The link %s points to a file outside the book') %
                    fl(href), name, lnum, col))
        elif purl.scheme in local_schemes and ':' in urlunquote(purl.path):
            report(
                InvalidCharInLink(
                    _('The link %s contains a : character, this will cause errors on Windows computers'
                      ) % fl(href), name, lnum, col))

    def linked_styles(sources):
        # Stylesheets directly linked from any of the given files
        return {
            target
            for source in sources for target in links_map[source]
            if container.mime_map.get(target, None) in OEB_STYLES
        }

    # Pass 1: validate every link and record the resolvable ones
    for name, mt in container.mime_map.iteritems():
        if not (mt in OEB_DOCS or mt in OEB_STYLES or mt in xml_types):
            continue
        for href, lnum, col in container.iterlinks(name):
            if not href:
                report(EmptyLink(_('The link is empty'), name, lnum, col))
            try:
                tname = container.href_to_name(href, name)
            except ValueError:
                # Absolute paths to files on another drive in windows cause this
                tname = None

            if tname is None:
                report_unresolved(href, name, lnum, col)
            elif not container.exists(tname):
                cname = corrected_case_for_name(container, tname)
                if cname is None:
                    report(
                        DanglingLink(
                            _('The linked resource %s does not exist') %
                            fl(href), tname, name, lnum, col))
                else:
                    report(CaseMismatch(href, cname, name, lnum, col))
            elif tname in container.mime_map:
                links_map[name].add(tname)
            elif os.path.isdir(container.name_to_abspath(tname)):
                # Filesystem says the file exists, but it is not in the
                # mime_map, and it turned out to be a directory
                report(
                    BadLink(
                        _('The linked resource %s is a directory') % fl(href),
                        name, lnum, col))
            else:
                # Exists on disk but not in the mime_map: case mismatch
                report(
                    CaseMismatch(href,
                                 actual_case_for_name(container, tname), name,
                                 lnum, col))

    # Pass 2: work out what is reachable from the spine
    spine_docs = {doc for doc, _linear in container.spine_names}
    spine_styles = linked_styles(spine_docs)
    while True:
        # Handle import rules in stylesheets: grow until nothing new appears
        previous_count = len(spine_styles)
        spine_styles |= linked_styles(spine_styles)
        if len(spine_styles) <= previous_count:
            break

    seen = set(OEB_DOCS) | set(OEB_STYLES)
    spine_resources = {
        target
        for source in spine_docs | spine_styles
        for target in links_map[source]
        if container.mime_map[target] not in seen
    }

    cover_name = container.guide_type_map.get('cover', None)
    nav_items = frozenset(container.manifest_items_with_property('nav'))
    media_kinds = {'image', 'audio', 'video'}
    unreferenced = set()

    for name, mt in container.mime_map.iteritems():
        if mt in OEB_STYLES and name not in spine_styles:
            problem = UnreferencedResource(name)
        elif mt in OEB_DOCS and name not in spine_docs and name not in nav_items:
            problem = UnreferencedDoc(name)
        elif (mt in OEB_FONTS or mt.partition('/')[0] in media_kinds
              ) and name not in spine_resources and name != cover_name:
            if mt.partition('/')[0] == 'image' and name == get_raster_cover_name(
                    container):
                # The raster cover image is allowed to be unreferenced
                continue
            problem = UnreferencedResource(name)
        else:
            continue
        report(problem)
        unreferenced.add(name)

    # Pass 3: manifest membership and stray bookmark files
    manifest_names = set(container.manifest_id_map.itervalues())
    for name in container.mime_map:
        if name not in manifest_names and not container.ok_to_be_unmanifested(
                name):
            report(Unmanifested(name, unreferenced=name in unreferenced))
        if name == 'META-INF/calibre_bookmarks.txt':
            report(Bookmarks(name))

    return errors
Ejemplo n.º 33
0
    def create_image_markup(self, html_img, stylizer, href, as_block=False):
        """Build and return the ``w:drawing`` element for an HTML image.

        :param html_img: The source ``<img>`` element; its ``alt`` attribute
            (or the image file name when ``alt`` is empty) is passed on as
            the description.
        :param stylizer: Object whose ``style()`` method yields the computed
            style for an element.
        :param href: Key into ``self.images`` identifying the image.
        :param as_block: When true the image is treated as a block: auto
            horizontal margins decide its alignment and a floating image
            gets top-and-bottom wrapping instead of square wrapping.
        :return: The detached ``w:drawing`` element.

        An image whose alignment resolves to ``None`` is emitted inline
        (``wp:inline``) with its margins faked via ``wp:effectExtent``;
        otherwise it is emitted as an anchored image (``wp:anchor``) aligned
        ``left``, ``right`` or ``center``.
        """
        # TODO: img inside a link (clickable image)
        style = stylizer.style(html_img)
        floating = style['float']
        if floating not in {'left', 'right'}:
            floating = None
        if as_block:
            ml, mr = style._get('margin-left'), style._get('margin-right')
            # An auto margin absorbs the free space on its side, pushing the
            # image towards the opposite side; two auto margins center it.
            if ml == 'auto' and mr == 'auto':
                floating = 'center'
            elif ml == 'auto':
                floating = 'right'
            elif mr == 'auto':
                floating = 'left'
        else:
            parent = html_img.getparent()
            if len(parent) == 1 and not (parent.text or '').strip() and not (
                    html_img.tail or '').strip():
                pstyle = stylizer.style(parent)
                if 'block' in pstyle['display']:
                    # We have an inline image alone inside a block
                    as_block = True
                    floating = pstyle['float']
                    if floating not in {'left', 'right'}:
                        floating = None
                        if pstyle['text-align'] in ('center', 'right'):
                            floating = pstyle['text-align']
                    floating = floating or 'left'
        fake_margins = floating is None
        self.count += 1
        img = self.images[href]
        name = urlunquote(posixpath.basename(href))
        width, height = style.img_size(img.width, img.height)
        scaled, width, height = fit_image(width, height, self.page_width,
                                          self.page_height)
        width, height = map(pt_to_emu, (width, height))

        namespace = self.document_relationships.namespace
        makeelement, namespaces = namespace.makeelement, namespace.namespaces

        root = etree.Element('root', nsmap=namespaces)
        ans = makeelement(root, 'w:drawing', append=False)
        if floating is None:
            parent = makeelement(ans, 'wp:inline')
        else:
            parent = makeelement(ans, 'wp:anchor', **get_image_margins(style))
            # The following attributes are boilerplate that Word requires,
            # even though the DOCX specs define defaults for all of them
            for attr, val in (
                    ('simplePos', '0'),
                    ('relativeHeight', '1'),
                    ('behindDoc', '0'),
                    ('locked', '0'),
                    ('layoutInCell', '1'),
                    ('allowOverlap', '1'),
            ):
                parent.set(attr, val)
            makeelement(parent, 'wp:simplePos', x='0', y='0')
            makeelement(
                makeelement(parent, 'wp:positionH', relativeFrom='margin'),
                'wp:align').text = floating
            makeelement(
                makeelement(parent, 'wp:positionV', relativeFrom='line'),
                'wp:align').text = 'top'
        makeelement(parent, 'wp:extent', cx=str(width), cy=str(height))
        if fake_margins:
            # DOCX does not support setting margins for inline images, so we
            # fake it by using effect extents to simulate margins
            makeelement(
                parent, 'wp:effectExtent', **{
                    k[-1].lower(): v
                    for k, v in get_image_margins(style).iteritems()
                })
        else:
            makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
        if floating is not None:
            # Word requires the wrap element to come after the extent settings
            if as_block:
                makeelement(parent, 'wp:wrapTopAndBottom')
            else:
                makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
        self.create_docx_image_markup(parent, name,
                                      html_img.get('alt') or name, img.rid,
                                      width, height)
        return ans