Exemple #1
0
 def _toc_from_html(self, opf):
     """Populate the book TOC from the links of the guide's 'toc' document.

     Returns False without touching the TOC when the guide has no 'toc'
     entry. Otherwise one TOC entry is added per distinct link target, in
     order of first appearance, and True is returned. Links whose file part
     is not in the manifest are ignored; the texts of several anchors that
     point at the same target are joined with single spaces.

     ``opf`` is accepted but not used by this method.
     """
     guide = self.oeb.guide
     if 'toc' not in guide:
         return False
     self.log.debug('Reading TOC from HTML...')
     known_hrefs = self.oeb.manifest.hrefs
     toc_path, toc_frag = urldefrag(guide['toc'].href)
     item = known_hrefs[toc_path]
     root = item.data
     scope = root
     if toc_frag:
         # Look the fragment up as an id first, then as a named anchor.
         matches = xpath(root, './/*[@id="%s"]' % toc_frag)
         if not matches:
             matches = xpath(root, './/*[@name="%s"]' % toc_frag)
         if matches:
             scope = matches[0]
         # Widen the scope towards the document root until it holds a link.
         while scope != root and not xpath(scope, './/h:a[@href]'):
             scope = scope.getparent()
     labels = {}
     first_seen = []
     for anchor in xpath(scope, './/h:a[@href]'):
         target = item.abshref(urlnormalize(anchor.attrib['href']))
         if urldefrag(target)[0] not in known_hrefs:
             continue
         text = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
         if target not in labels:
             first_seen.append(target)
             labels[target] = []
         labels[target].append(text)
     toc = self.oeb.toc
     for target in first_seen:
         toc.add(' '.join(labels[target]), target)
     return True
Exemple #2
0
 def _toc_from_spine(self, opf):
     """Generate a fallback TOC with one entry per linear spine item.

     Each linear item is labelled with the text of its ``<title>``. If two
     items share the same title, every item is labelled with its first
     non-empty heading instead (h1..h5, then strong), or '(unlabled)' when
     it has none. An item without a title also falls back to its heading.

     Labels are recorded together with the item they were read from, so
     non-linear items or items with an empty title can no longer shift the
     labels onto the wrong spine entries.

     ``opf`` is accepted but not used. Always returns True.
     """
     self.log.warn('Generating default TOC from spine...')
     toc = self.oeb.toc
     entries = []  # (item, title, header) for linear items only
     for item in self.oeb.spine:
         if not item.linear:
             continue
         html = item.data
         title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
         title = COLLAPSE_RE.sub(' ', title.strip())
         header = '(unlabled)'
         for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
             expr = '/h:html/h:body//h:%s[position()=1]/text()'
             text = ''.join(xpath(html, expr % tag))
             text = COLLAPSE_RE.sub(' ', text.strip())
             if text:
                 header = text
                 break
         entries.append((item, title, header))
     titles = [title for _, title, _ in entries if title]
     # Duplicate titles cannot tell chapters apart; switch to headings.
     use_headers = len(titles) > len(set(titles))
     for item, title, header in entries:
         label = header if (use_headers or not title) else title
         toc.add(label, item.href)
     return True
Exemple #3
0
 def _clean_opf(self, opf):
     """Return a new OPF 2 ``package`` element built from *opf*.

     Elements with no namespace or the OPF 1 namespace are retagged into
     the OPF namespace. Every descendant of the metadata element (except
     the ``dc-metadata``/``x-metadata`` wrappers themselves) is appended
     directly to a fresh ``metadata`` element, with Dublin Core tags
     lower-cased and moved to the DC 1.1 namespace. The manifest, spine,
     tours and guide elements are then appended to the new root.

     NOTE(review): appending an lxml element that already has a parent
     moves it, so *opf* is presumably emptied as a side effect - confirm.
     """
     # Gather every namespace declaration before any tag is rewritten.
     collected_ns = {}
     for node in opf.iter(tag=etree.Element):
         collected_ns.update(node.nsmap)
     for node in opf.iter(tag=etree.Element):
         bare = barename(node.tag)
         if ':' not in bare and namespace(node.tag) in ('', OPF1_NS):
             node.tag = OPF(bare)
     collected_ns.update(OPF2_NSMAP)
     new_root = etree.Element(OPF('package'),
         nsmap={None: OPF2_NS}, attrib=dict(opf.attrib))
     new_meta = etree.SubElement(new_root, OPF('metadata'),
         nsmap=collected_ns)
     wrappers = (OPF('dc-metadata'), OPF('x-metadata'))
     for node in xpath(opf, 'o2:metadata//*'):
         if node.tag in wrappers:
             continue
         if namespace(node.tag) in DC_NSES:
             node.tag = '{%s}%s' % (DC11_NS, barename(node.tag).lower())
         if node.tag.startswith('dc:'):
             # Literal 'dc:' prefix left in the tag name.
             local = node.tag.partition(':')[-1].lower()
             node.tag = '{%s}%s' % (DC11_NS, local)
         new_meta.append(node)
     for node in xpath(opf, 'o2:metadata//o2:meta'):
         new_meta.append(node)
     for section in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
         for node in xpath(opf, section):
             new_root.append(node)
     return new_root
Exemple #4
0
    def _toc_from_navpoint(self, item, toc, navpoint):
        """Recursively add the NCX navPoint children of *navpoint* to *toc*.

        :param item: the NCX manifest item; its ``abshref`` resolves each
            ``ncx:content/@src`` value.
        :param toc: the TOC node that receives the new entries.
        :param navpoint: element whose ``ncx:navPoint`` children are read.

        navPoints without a label are not added themselves; their children
        are attached to *toc* directly. A navPoint with no target, or with a
        target that is not in the manifest, is kept only if it has child
        navPoints; a missing target is replaced by 'missing.html'.
        """
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                # Unlabelled node: hoist its children to the current level.
                self._toc_from_navpoint(item, toc, child)
                continue
            if not href:
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
                # Keep the placeholder in a list: href[0] below must yield
                # the whole name, not its first character.
                href = ['missing.html']

            href = item.abshref(urlnormalize(href[0]))
            path, _ = urldefrag(href)
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            node_id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except (TypeError, ValueError):
                # Non-numeric playOrder attribute: allocate the next one.
                po = self.oeb.toc.next_play_order()

            author_elems = xpath(child,
                    'descendant::calibre:meta[@name = "author"]')
            author = author_elems[0].text if author_elems else None

            description = None
            description_elems = xpath(child,
                    'descendant::calibre:meta[@name = "description"]')
            if description_elems:
                description = etree.tostring(description_elems[0],
                        method='text', encoding=unicode).strip()
                if not description:
                    description = None

            index_image = xpath(child,
                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=node_id, klass=klass,
                    play_order=po, description=description, author=author,
                    toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)
Exemple #5
0
 def _toc_from_ncx(self, item):
     """Read the book TOC from the NCX manifest item *item*.

     Returns False when there is no item or it has no data. Otherwise the
     TOC title is set from the NCX ``docTitle`` (falling back to the first
     title in the book metadata), every ``navMap`` is walked with
     ``_toc_from_navpoint`` and True is returned.
     """
     if item is None or item.data is None:
         return False
     self.log.debug('Reading TOC from NCX...')
     ncx = item.data
     raw_title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
     doc_title = COLLAPSE_RE.sub(' ', raw_title.strip())
     if not doc_title:
         doc_title = unicode(self.oeb.metadata.title[0])
     toc = self.oeb.toc
     toc.title = doc_title
     for navmap in xpath(ncx, 'ncx:navMap'):
         self._toc_from_navpoint(item, toc, navmap)
     return True
Exemple #6
0
 def _manifest_from_opf(self, opf):
     """Fill the book manifest from the ``<item>`` elements of *opf*.

     The media type is read from ``media-type`` (or ``mediatype``); when it
     is missing or 'text/xml' it is guessed from the href, with
     BINARY_MIME as the last resort. Entries with an href already in the
     manifest, or whose file the container does not have, are skipped with
     a warning. A clashing id gets a replacement from
     ``manifest.generate``. Finally invalid items are pruned and the
     result is handed to ``_manifest_add_missing``.
     """
     manifest = self.oeb.manifest
     for entry in xpath(opf, '/o2:package/o2:manifest/o2:item'):
         item_id = entry.get('id')
         item_href = entry.get('href')
         mime = entry.get('media-type', None)
         if mime is None:
             mime = entry.get('mediatype', None)
         if not mime or mime == 'text/xml':
             mime = guess_type(item_href)[0] or mime or BINARY_MIME
         if hasattr(mime, 'lower'):
             mime = mime.lower()
         fallback = entry.get('fallback')
         if item_href in manifest.hrefs:
             self.logger.warn(u'Duplicate manifest entry for %r' % item_href)
         elif not self.oeb.container.exists(item_href):
             self.logger.warn(u'Manifest item %r not found' % item_href)
         else:
             if item_id in manifest.ids:
                 self.logger.warn(u'Duplicate manifest id %r' % item_id)
                 item_id, item_href = manifest.generate(item_id, item_href)
             manifest.add(item_id, item_href, mime, fallback)
     self._manifest_add_missing(self._manifest_prune_invalid())
Exemple #7
0
 def _manifest_from_opf(self, opf):
     """Register every usable ``<item>`` of the OPF manifest in the book.

     For each item the media type comes from ``media-type``, then
     ``mediatype``; a missing or 'text/xml' type is replaced by a guess
     based on the href (BINARY_MIME if nothing else is known) and the
     result is lower-cased. Duplicate hrefs and files missing from the
     container are logged and skipped; duplicate ids are regenerated.
     Afterwards invalid items are pruned and passed to
     ``_manifest_add_missing``.
     """
     manifest = self.oeb.manifest
     for node in xpath(opf, '/o2:package/o2:manifest/o2:item'):
         ident = node.get('id')
         target = node.get('href')
         # 'mediatype' is only consulted when 'media-type' is absent.
         kind = node.get('media-type', node.get('mediatype', None))
         if not kind or kind == 'text/xml':
             by_name = guess_type(target)[0]
             kind = by_name or kind or BINARY_MIME
         if hasattr(kind, 'lower'):
             kind = kind.lower()
         fallback = node.get('fallback')
         if target in manifest.hrefs:
             self.logger.warn('Duplicate manifest entry for %r' % target)
             continue
         if not self.oeb.container.exists(target):
             self.logger.warn('Manifest item %r not found' % target)
             continue
         if ident in manifest.ids:
             self.logger.warn('Duplicate manifest id %r' % ident)
             ident, target = manifest.generate(ident, target)
         manifest.add(ident, target, kind, fallback)
     pruned = self._manifest_prune_invalid()
     self._manifest_add_missing(pruned)
Exemple #8
0
    def _toc_from_navpoint(self, item, toc, navpoint):
        """Recursively add the NCX navPoint children of *navpoint* to *toc*.

        :param item: the NCX manifest item; its ``abshref`` resolves each
            ``ncx:content/@src`` value.
        :param toc: the TOC node that receives the new entries.
        :param navpoint: element whose ``ncx:navPoint`` children are read.

        navPoints without a label are not added themselves; their children
        are attached to *toc* directly. A navPoint without a target is
        kept, with an empty href, only when it has child navPoints. The same
        applies to targets that are not in the manifest, except that the
        href is kept as is and a warning is logged.
        """
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                # Unlabelled node: hoist its children to the current level.
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
            path, _ = urldefrag(href)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            node_id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except (TypeError, ValueError):
                # Only a malformed playOrder is expected here; anything else
                # (e.g. KeyboardInterrupt) must not be swallowed.
                po = self.oeb.toc.next_play_order()

            author_elems = xpath(child,
                    'descendant::calibre:meta[@name = "author"]')
            author = author_elems[0].text if author_elems else None

            description = None
            description_elems = xpath(child,
                    'descendant::calibre:meta[@name = "description"]')
            if description_elems:
                description = etree.tostring(description_elems[0],
                        method='text', encoding=unicode).strip()
                if not description:
                    description = None

            index_image = xpath(child,
                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=node_id, klass=klass,
                    play_order=po, description=description, author=author,
                    toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)
Exemple #9
0
 def rasterize_item(self, item):
     """Rasterize every SVG used by the HTML document *item*.

     ``<img src>`` and ``<object data>`` references that resolve to a
     manifest item of SVG type go to ``rasterize_external``; inline
     ``<svg>`` elements go to ``rasterize_inline``. Images are handled
     first, then objects, then inline SVG.
     """
     html = item.data
     hrefs = self.oeb.manifest.hrefs
     external_refs = (
         ('//h:img[@src]', 'src'),
         ('//h:object[@type="%s" and @data]' % SVG_MIME, 'data'),
     )
     for expr, attr in external_refs:
         for elem in xpath(html, expr):
             ref = urlnormalize(elem.attrib[attr])
             image = hrefs.get(item.abshref(ref), None)
             if image and image.media_type == SVG_MIME:
                 style = self.stylizer(item).style(elem)
                 self.rasterize_external(elem, style, item, image)
     for elem in xpath(html, '//svg:svg'):
         style = self.stylizer(item).style(elem)
         self.rasterize_inline(elem, style, item)
Exemple #10
0
 def rasterize_item(self, item):
     """Replace the SVG content of *item* with rasterized versions.

     Three passes are made over the document: ``<img>`` elements, then
     ``<object>`` elements of SVG type, then inline ``<svg>`` elements.
     External references are only processed when they resolve to a
     manifest entry whose media type is SVG_MIME.
     """
     doc = item.data
     manifest_hrefs = self.oeb.manifest.hrefs
     for img in xpath(doc, '//h:img[@src]'):
         linked = manifest_hrefs.get(
             item.abshref(urlnormalize(img.attrib['src'])), None)
         if not linked or linked.media_type != SVG_MIME:
             continue
         img_style = self.stylizer(item).style(img)
         self.rasterize_external(img, img_style, item, linked)
     for obj in xpath(doc, '//h:object[@type="%s" and @data]' % SVG_MIME):
         linked = manifest_hrefs.get(
             item.abshref(urlnormalize(obj.attrib['data'])), None)
         if not linked or linked.media_type != SVG_MIME:
             continue
         obj_style = self.stylizer(item).style(obj)
         self.rasterize_external(obj, obj_style, item, linked)
     for inline in xpath(doc, '//svg:svg'):
         self.rasterize_inline(inline, self.stylizer(item).style(inline), item)
Exemple #11
0
def collect_properties(container):
    """Record content properties on the OPF manifest items of *container*.

    Every manifest item whose media type is in OEB_DOCS is parsed, given an
    ``epub`` namespace prefix and written back with ``container.replace``.
    The properties 'svg', 'scripted', 'mathml' and 'switch' are then added
    to the item for each kind of element found in the document.
    """
    probes = (
        ('//svg:svg', 'svg'),
        ('//h:script', 'scripted'),
        ('//mathml:math', 'mathml'),
        ('//epub:switch', 'switch'),
    )
    items = container.opf_xpath('//opf:manifest/opf:item[@href and @media-type]')
    for item in items:
        media_type = item.get('media-type') or ''
        if media_type.lower() not in OEB_DOCS:
            continue
        name = container.href_to_name(item.get('href'), container.opf_name)
        root = ensure_namespace_prefixes(container.parsed(name), {'epub': EPUB_NS})
        found = set()
        container.replace(name, root)  # Ensure entities are converted
        for expr, prop in probes:
            if xpath(root, expr):
                found.add(prop)
        if found:
            add_properties(item, *found)
Exemple #12
0
def collect_properties(container):
    """Add content-derived properties to the manifest items of *container*.

    For each manifest item with an OEB_DOCS media type, the parsed document
    is marked dirty and searched for SVG, script, MathML and
    ``epub:switch`` elements. The matching property names ('svg',
    'scripted', 'mathml', 'switch') are added to the item.
    """
    feature_exprs = (
        ('svg', '//svg:svg'),
        ('scripted', '//h:script'),
        ('mathml', '//mathml:math'),
        ('switch', '//epub:switch'),
    )
    for item in container.opf_xpath(
            '//opf:manifest/opf:item[@href and @media-type]'):
        if (item.get('media-type') or '').lower() not in OEB_DOCS:
            continue
        name = container.href_to_name(item.get('href'), container.opf_name)
        root = container.parsed(name)
        container.dirty(name)  # Ensure entities are converted
        present = set(
            prop for prop, expr in feature_exprs if xpath(root, expr))
        if present:
            add_properties(item, *present)
Exemple #13
0
 def _spine_from_opf(self, opf):
     """Build the book spine from the ``<itemref>`` elements of *opf*.

     References to unknown manifest ids are skipped with a warning. An item
     is added only if its media type is in OEB_DOCS and its data exposes an
     ``xpath`` attribute; other items are reported and left out. Raises
     OEBError if the spine ends up empty. Afterwards extra items are added
     via ``_spine_add_extra`` and a valid ``page-progression-direction``
     ('ltr' or 'rtl') is copied onto the spine.
     """
     spine = self.oeb.spine
     manifest = self.oeb.manifest
     for ref in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
         idref = ref.get('idref')
         if idref not in manifest.ids:
             self.logger.warn(u'Spine item %r not found' % idref)
             continue
         item = manifest.ids[idref]
         is_document = (item.media_type.lower() in OEB_DOCS
                        and hasattr(item.data, 'xpath'))
         if not is_document:
             self.oeb.log.warn('The item %s is not a XML document.'
                     ' Removing it from spine.'%item.href)
             continue
         spine.add(item, ref.get('linear'))
     if len(spine) == 0:
         raise OEBError("Spine is empty")
     self._spine_add_extra()
     for direction in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
         if direction in ('ltr', 'rtl'):
             spine.page_progression_direction = direction
Exemple #14
0
 def _spine_from_opf(self, opf):
     """Promote generic-XML spine items to XHTML, then build the spine.

     A spine item declared as 'application/xml' whose parsed data has an
     ``/html`` root is relabelled 'application/xhtml+xml' and its data is
     re-parsed with ``_parse_xhtml``. The base class implementation then
     does the actual spine construction.
     """
     manifest = self.oeb.manifest
     for ref in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
         idref = ref.get('idref')
         if idref not in manifest.ids:
             continue
         item = manifest.ids[idref]
         if item.media_type.lower() != 'application/xml':
             continue
         if not hasattr(item.data, 'xpath'):
             continue
         if not item.data.xpath('/html'):
             continue
         item.media_type = 'application/xhtml+xml'
         item.data = item._parse_xhtml(etree.tostring(item.data))
     super(LitReader, self)._spine_from_opf(opf)
Exemple #15
0
 def _toc_from_tour(self, opf):
     """Build the TOC from the first ``<tour>`` element of *opf*.

     Returns False when the OPF has no tour. Otherwise the TOC takes the
     tour's title and gets one entry per ``<site>`` that has both a title
     and an href whose file part is in the manifest; other sites are
     skipped (missing targets with a warning). Returns True.
     """
     tours = xpath(opf, 'o2:tours/o2:tour')
     if not tours:
         return False
     self.log.debug('Reading TOC from tour...')
     first_tour = tours[0]
     toc = self.oeb.toc
     toc.title = first_tour.get('title')
     for site in xpath(first_tour, 'o2:site'):
         label = site.get('title')
         target = site.get('href')
         if not (label and target):
             continue
         file_part = urldefrag(urlnormalize(target))[0]
         if file_part not in self.oeb.manifest.hrefs:
             self.logger.warn('TOC reference %r not found' % target)
             continue
         toc.add(label, target, id=site.get('id'))
     return True
Exemple #16
0
 def _spine_from_opf(self, opf):
     """Fix up mislabelled HTML spine items before building the spine.

     Spine items typed 'application/xml' that actually parse to a document
     with an ``/html`` root get the XHTML media type and are re-parsed via
     ``_parse_xhtml``. Spine construction itself is delegated to the
     parent class.
     """
     known_ids = self.oeb.manifest.ids
     for itemref in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
         ref_id = itemref.get('idref')
         if ref_id in known_ids:
             item = known_ids[ref_id]
             really_html = (item.media_type.lower() == 'application/xml'
                            and hasattr(item.data, 'xpath')
                            and item.data.xpath('/html'))
             if really_html:
                 item.media_type = 'application/xhtml+xml'
                 item.data = item._parse_xhtml(etree.tostring(item.data))
     super(LitReader, self)._spine_from_opf(opf)
Exemple #17
0
 def _pages_from_ncx(self, opf, item):
     """Read the page list of the NCX item *item* into the book's pages.

     Returns False when there is no item, no data, or no ``pageTarget``
     entries. Otherwise every page target with a ``content/@src`` is added
     with its collapsed label, resolved href, type (default 'normal'), id
     and class, and True is returned.

     ``opf`` is accepted but not used by this method.
     """
     if item is None:
         return False
     ncx = item.data
     if ncx is None:
         return False
     targets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
     if not targets:
         return False
     pages = self.oeb.pages
     for target in targets:
         label = ''.join(xpath(target, 'ncx:navLabel/ncx:text/text()'))
         label = COLLAPSE_RE.sub(' ', label.strip())
         sources = xpath(target, 'ncx:content/@src')
         if sources:
             pages.add(
                 label,
                 item.abshref(urlnormalize(sources[0])),
                 type=target.get('type', 'normal'),
                 id=target.get('id'),
                 klass=target.get('class'))
     return True
Exemple #18
0
 def _find_page_map(self, opf):
     """Locate the page-map item, remove it from the manifest and return it.

     If the spine declares a ``page-map`` id, only that id is considered
     (None is returned when it is not in the manifest). Otherwise the first
     manifest item with the PAGE_MAP_MIME media type is used. Returns None
     when nothing matches.
     """
     manifest = self.oeb.manifest
     declared = xpath(opf, '/o2:package/o2:spine/@page-map')
     if declared:
         map_id = declared[0]
         if map_id in manifest.ids:
             found = manifest.ids[map_id]
             manifest.remove(found)
             return found
         return None
     for candidate in manifest.values():
         if candidate.media_type != PAGE_MAP_MIME:
             continue
         manifest.remove(candidate)
         return candidate
     return None
Exemple #19
0
 def _find_ncx(self, opf):
     """Locate the NCX item, remove it from the manifest and return it.

     If the spine has a ``toc`` attribute, only the id it names is
     considered (None is returned when it is not in the manifest).
     Otherwise the first manifest item with the NCX_MIME media type is
     used. Returns None when nothing matches.
     """
     manifest = self.oeb.manifest
     declared = xpath(opf, '/o2:package/o2:spine/@toc')
     if declared:
         ncx_id = declared[0]
         if ncx_id in manifest.ids:
             found = manifest.ids[ncx_id]
             manifest.remove(found)
             return found
         return None
     # Iterate over a copy, since the manifest is modified on a match.
     for candidate in list(manifest.values()):
         if candidate.media_type != NCX_MIME:
             continue
         manifest.remove(candidate)
         return candidate
     return None
Exemple #20
0
 def dataize_svg(self, item, svg=None):
     """Embed manifest resources referenced by an SVG as ``data:`` URIs.

     :param item: manifest item the SVG belongs to; relative references
         are resolved with its ``abshref``.
     :param svg: SVG tree to process; defaults to ``item.data``.
     :return: the same tree, with matching ``xlink:href`` attributes
         rewritten in place.

     References without a file part, or to files not in the manifest, are
     left untouched.
     """
     if svg is None:
         svg = item.data
     hrefs = self.oeb.manifest.hrefs
     for elem in xpath(svg, '//svg:*[@xl:href]'):
         href = urlnormalize(elem.attrib[XLINK('href')])
         path = urldefrag(href)[0]
         if not path:
             # No file part (e.g. a pure fragment reference).
             continue
         abshref = item.abshref(path)
         if abshref not in hrefs:
             continue
         linkee = hrefs[abshref]
         # b64encode yields one unbroken line; encodestring wraps its
         # output with newlines, which would end up inside the URI.
         data = base64.b64encode(str(linkee))
         data = "data:%s;base64,%s" % (linkee.media_type, data)
         elem.attrib[XLINK('href')] = data
     return svg
Exemple #21
0
 def dataize_svg(self, item, svg=None):
     """Inline every manifest resource linked from *svg* as a data URI.

     :param item: manifest item owning the SVG; used to resolve relative
         ``xlink:href`` values through ``abshref``.
     :param svg: tree to rewrite; ``item.data`` is used when omitted.
     :return: *svg* itself, modified in place.

     Links with an empty file part and links to files outside the manifest
     are not changed.
     """
     if svg is None:
         svg = item.data
     hrefs = self.oeb.manifest.hrefs
     link_attr = XLINK('href')
     for elem in xpath(svg, '//svg:*[@xl:href]'):
         path = urldefrag(urlnormalize(elem.attrib[link_attr]))[0]
         if not path:
             continue
         abshref = item.abshref(path)
         if abshref not in hrefs:
             continue
         linkee = hrefs[abshref]
         # Use b64encode rather than encodestring: the latter inserts line
         # breaks (and a trailing newline) that do not belong in a URI.
         payload = base64.b64encode(str(linkee))
         elem.attrib[link_attr] = "data:%s;base64,%s" % (
             linkee.media_type, payload)
     return svg
Exemple #22
0
 def _spine_from_opf(self, opf):
     """Build the book spine from the ``<itemref>`` elements of *opf*.

     Every itemref whose ``idref`` is a known manifest id is added to the
     spine; unknown ids are skipped with a warning. Items that are neither
     of an OEB_DOCS media type nor parsed into something with an ``xpath``
     attribute are then removed again. Raises OEBError if the spine ends up
     empty; otherwise ``_spine_add_extra`` is called.
     """
     spine = self.oeb.spine
     manifest = self.oeb.manifest
     for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
         idref = elem.get('idref')
         if idref not in manifest.ids:
             self.logger.warn(u'Spine item %r not found' % idref)
             continue
         item = manifest.ids[idref]
         spine.add(item, elem.get('linear'))
     # Walk a snapshot: removing entries from the spine while iterating
     # over the spine itself can cause following items to be skipped.
     for item in list(spine):
         if item.media_type.lower() not in OEB_DOCS:
             if not hasattr(item.data, 'xpath'):
                 self.oeb.log.warn('The item %s is not a XML document.'
                         ' Removing it from spine.'%item.href)
                 spine.remove(item)
     if len(spine) == 0:
         raise OEBError("Spine is empty")
     self._spine_add_extra()
 def _spine_from_opf(self, opf):
     """Populate the spine from the OPF ``<itemref>`` list.

     Itemrefs pointing at unknown manifest ids are logged and ignored; all
     others are added in document order. Spine items that are not of an
     OEB_DOCS media type and whose data has no ``xpath`` attribute are
     dropped with a warning. Raises OEBError when no item is left, then
     calls ``_spine_add_extra``.
     """
     spine = self.oeb.spine
     manifest = self.oeb.manifest
     for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
         idref = elem.get('idref')
         if idref not in manifest.ids:
             self.logger.warn(u'Spine item %r not found' % idref)
             continue
         spine.add(manifest.ids[idref], elem.get('linear'))
     # Copy first: the loop removes entries, and mutating the spine while
     # iterating over it directly can make the iteration skip items.
     for item in list(spine):
         if item.media_type.lower() in OEB_DOCS:
             continue
         if hasattr(item.data, 'xpath'):
             continue
         self.oeb.log.warn('The item %s is not a XML document.'
                           ' Removing it from spine.' % item.href)
         spine.remove(item)
     if len(spine) == 0:
         raise OEBError("Spine is empty")
     self._spine_add_extra()
Exemple #24
0
 def _guide_from_opf(self, opf):
     """Fill the book guide from the ``<reference>`` elements of *opf*.

     A reference whose file part is not in the manifest is retried with a
     case-insensitive match against the manifest hrefs; if that succeeds
     the manifest href replaces the reference's href, otherwise the
     reference is skipped with a warning. Only the first reference of each
     type is added.
     """
     guide = self.oeb.guide
     manifest = self.oeb.manifest
     for ref in xpath(opf, '/o2:package/o2:guide/o2:reference'):
         ref_href = ref.get('href')
         path = urlnormalize(urldefrag(ref_href)[0])
         if path not in manifest.hrefs:
             wanted = path.lower()
             match = next(
                 (known for known in manifest.hrefs
                  if known.lower() == wanted),
                 None)
             if match is None:
                 self.logger.warn(u'Guide reference %r not found' % ref_href)
                 continue
             ref_href = match
         typ = ref.get('type')
         if typ not in guide:
             guide.add(typ, ref.get('title'), ref_href)
Exemple #25
0
 def _pages_from_page_map(self, opf):
     """Read page definitions from the book's page-map item, if any.

     Returns False when ``_find_page_map`` finds nothing. Otherwise each
     ``<page>`` with an href is added to the book's pages and True is
     returned. The page type is 'special' for an empty name, 'front' for a
     name made only of the letters ivxlcdm (any case), else 'normal'.
     """
     item = self._find_page_map(opf)
     if item is None:
         return False
     pmap = item.data
     pages = self.oeb.pages
     for page in xpath(pmap, 'o2:page'):
         raw_name = page.get('name', '')
         raw_href = page.get('href')
         if not raw_href:
             continue
         name = COLLAPSE_RE.sub(' ', raw_name.strip())
         href = item.abshref(urlnormalize(raw_href))
         if not name:
             kind = 'special'
         elif name.lower().strip('ivxlcdm') == '':
             # Nothing left once roman-numeral letters are stripped.
             kind = 'front'
         else:
             kind = 'normal'
         pages.add(name, href, type=kind)
     return True
Exemple #26
0
 def dataize_svg(self, item, svg=None):
     """Point the links of an SVG at temporary copies of manifest files.

     :param item: manifest item the SVG belongs to; relative references
         are resolved with its ``abshref``.
     :param svg: SVG tree to process; defaults to ``item.data``.
     :return: the same tree, modified in place.

     Each linked manifest resource is written to a persistent temporary
     file (its name is recorded in ``self.temp_files``) and the
     ``xlink:href`` is replaced by that file name. The suffix comes from
     ``what``, defaulting to 'jpg'.
     """
     if svg is None:
         svg = item.data
     hrefs = self.oeb.manifest.hrefs
     for elem in xpath(svg, '//svg:*[@xl:href]'):
         link = urlnormalize(elem.attrib[XLINK('href')])
         path = urldefrag(link)[0]
         if path:
             abshref = item.abshref(path)
             if abshref in hrefs:
                 raw = str(hrefs[abshref])
                 ext = what(None, raw) or 'jpg'
                 with PersistentTemporaryFile(suffix='.'+ext) as pt:
                     pt.write(raw)
                     self.temp_files.append(pt.name)
                 elem.attrib[XLINK('href')] = pt.name
     return svg
Exemple #27
0
 def dataize_svg(self, item, svg=None):
     """Replace manifest links in an SVG with paths to temporary files.

     :param item: manifest item owning the SVG; resolves relative links
         through ``abshref``.
     :param svg: tree to rewrite; ``item.data`` is used when omitted.
     :return: *svg* itself, modified in place.

     The serialized form of every linked manifest item is dumped into a
     persistent temporary file whose name is appended to
     ``self.temp_files`` and stored in the ``xlink:href`` attribute. Links
     with an empty file part or a target outside the manifest are skipped.
     """
     tree = item.data if svg is None else svg
     hrefs = self.oeb.manifest.hrefs
     for node in xpath(tree, '//svg:*[@xl:href]'):
         path = urldefrag(urlnormalize(node.attrib[XLINK('href')]))[0]
         if not path:
             continue
         abshref = item.abshref(path)
         if abshref not in hrefs:
             continue
         payload = str(hrefs[abshref])
         # Unrecognised image data is assumed to be JPEG.
         suffix = '.' + (what(None, payload) or 'jpg')
         with PersistentTemporaryFile(suffix=suffix) as tmp:
             tmp.write(payload)
             self.temp_files.append(tmp.name)
         node.attrib[XLINK('href')] = tmp.name
     return tree
Exemple #28
0
    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Create an OEB book from the HTML file tree rooted at *htmlpath*.

        :param htmlpath: path of the root HTML file, passed to
            ``get_filelist``.
        :param basedir: base directory, passed to ``get_filelist``.
        :param opts: conversion options; ``input_encoding`` is read, and
            ``language`` / ``authors`` are used when the metadata lacks
            them.
        :param log: logger used for the book and the containers.
        :param mi: metadata object copied into the book's metadata.
        :return: the populated book.

        Steps: fill in missing language/creator/title metadata and add a
        UUID identifier; add every non-binary file to manifest and spine;
        rewrite HTML and CSS links through ``self.resource_adder``; build
        a TOC with one entry per linear spine item.
        """
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
            xpath)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import cssutils, logging
        cssutils.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn(u'Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries an id; the first
                # identifier in the list need not be that one.
                self.oeb.uid = ident
                break

        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            id, href = oeb.manifest.generate(id='html',
                    href=ascii_filename(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            item = oeb.manifest.hrefs[htmlfile_map[path]]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                # Resolve URLs relative to the directory the stylesheet was
                # added from (None if it cannot be found).
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                cssutils.replaceUrls(item.data,
                        partial(self.resource_adder, base=dpath))

        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        # Keep each label next to the spine item it was read from, so that
        # non-linear items or items without a title cannot shift labels
        # onto the wrong entries.
        entries = []  # (item, title, header) for linear items only
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            header = '(unlabled)'
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                text = ''.join(xpath(html, expr % tag))
                text = re.sub(r'\s+', ' ', text.strip())
                if text:
                    header = text
                    break
            entries.append((item, title, header))
        titles = [title for _, title, _ in entries if title]
        # Duplicate titles cannot tell chapters apart; switch to headings.
        use_headers = len(titles) > len(set(titles))
        for item, title, header in entries:
            label = header if (use_headers or not title) else title
            toc.add(label, item.href)

        oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True)
        return oeb
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css=''):
        """Collect the CSS that applies to ``tree`` and compute element styles.

        Stylesheets are gathered in this order: the built-in sheet returned
        by ``html_css_stylesheet()``, the ``<style>`` and ``<link>`` children
        of the document ``<head>``, and finally ``extra_css`` / ``user_css``.
        Every rule is flattened with ``self.flatten_rule``, sorted, and
        applied to the elements its selector matches. Afterwards inline
        ``style`` attributes are applied and ``width``/``height`` attributes
        of images are converted to CSS.

        Note that ``tree`` is modified: ``:first-letter`` rules insert a
        ``<span>`` into matching elements, and ``width``/``height``
        attributes are removed from unstyled ``<img>`` elements.

        :param tree: Parsed XHTML document to style.
        :param path: Key of the document in ``oeb.manifest.hrefs``.
        :param oeb: Book object; its manifest, logger and CSS preprocessor
            are used.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Output profile. When ``None`` the profile whose
            ``short_name`` is ``'default'`` is used, falling back to
            ``opts.output_profile``.
        :param extra_css: Additional CSS text, parsed after the document's
            own stylesheets. Ignored (with a logged error) if unparsable.
        :param user_css: User supplied CSS text, handled like ``extra_css``.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Name given to stylesheets parsed from inline/extra CSS text
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            # No <head>: iterate over nothing below
            head = []

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'], profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES):
                # The CSS may be split across the element's own text and the
                # text/tail of child nodes; stitch it back together.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = XHTML_CSS_NAMESPACE + text
                    text = oeb.css_preprocessor(text)
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except Exception:
                    # Bad extra/user CSS must not abort the conversion, but
                    # KeyboardInterrupt/SystemExit should still propagate.
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        # Running counter over all rules of all sheets, passed to
        # flatten_rule so that source order is available when sorting.
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                rules.extend(self.flatten_rule(rule, href, index))
                index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        for _, _, cssdict, text, _ in rules:
            # ``text`` is the selector text of the flattened rule
            fl = ':first-letter' in text
            if fl:
                text = text.replace(':first-letter', '')
            selector = get_css_selector(text)
            matches = selector(tree, self.logger)
            if fl:
                # Emulate :first-letter by wrapping the first letter (plus
                # any punctuation preceding it) in a <span> that receives
                # the rule's declarations.
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                if not unicodedata.category(
                                        text[0]).startswith('P'):
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (text[0] if text else u'')
                            span = E.span(special_text)
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            # Only the first text-bearing node is wrapped
                            break
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        # Matches values made up of digits only (no unit)
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        # Only one of width/height may be present
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Exemple #30
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        """Collect the CSS that applies to ``tree`` and compute element styles.

        Stylesheets are gathered in this order: the built-in sheet returned
        by ``html_css_stylesheet()``, ``base_css`` (if given), every
        ``style``/``link`` element found anywhere in the document (including
        manifest stylesheets pulled in through ``@import``), and finally
        ``extra_css`` / ``user_css``. Every rule is flattened with
        ``self.flatten_rule``, sorted, and applied to the elements its
        selector matches. Afterwards inline ``style`` attributes are applied
        and ``width``/``height`` attributes of images are converted to CSS.

        Note that ``tree`` may be modified: for mobi/docx output a
        ``<span data-fake-first-letter="1">`` is inserted for
        ``:first-letter`` rules, and ``width``/``height`` attributes are
        removed from unstyled ``<img>`` elements.

        :param tree: Parsed XHTML document to style.
        :param path: Key of the document in ``oeb.manifest.hrefs``.
        :param oeb: Book object; its manifest, logger and CSS preprocessor
            are used.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Output profile. When ``None`` the profile whose
            ``short_name`` is ``'default'`` is used, falling back to
            ``opts.output_profile``.
        :param extra_css: Additional CSS text, parsed after the document's
            own stylesheets. Ignored (with a logged error) if unparsable.
        :param user_css: User supplied CSS text, handled like ``extra_css``.
        :param base_css: CSS text placed directly after the built-in sheet.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Name given to stylesheets parsed from inline/extra CSS text
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        # Matched by local name, so elements are found wherever they are in
        # the document; the tag checks below still require the XHTML tags.
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                # The CSS may be split across the element's own text and the
                # text/tail of child nodes; stitch it back together.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Iterate over a snapshot, since rules are removed from
                    # the list being walked.
                    for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                        stylesheet.cssRules.remove(rule)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except Exception:
                    # Bad extra/user CSS must not abort the conversion, but
                    # KeyboardInterrupt/SystemExit should still propagate.
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        # Running counter over all rules of all sheets, passed to
        # flatten_rule so that source order is available when sorting.
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        # Sheet 0 is the built-in html_css_stylesheet(), i.e. the user agent
        # sheet.
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    media = {rule.media.item(i) for i in
                             xrange(rule.media.length)}
                    # Skip @media blocks that target none of these media
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                    index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        # The pattern contains no backslashes, so a plain unicode literal is
        # identical to the Python 2 only ``ur`` form and also parses on
        # Python 3.
        pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in rules:
            # ``text`` is the selector text of the flattened rule
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                # Leading punctuation/separator characters
                                # are kept together with the first letter.
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = E.span(special_text)
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                # Only the first text-bearing node is wrapped
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        # Matches values made up of digits and dots only (no unit)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        # Only one of width/height may be present
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Exemple #31
0
    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build an OEB book from ``htmlpath`` and the HTML files it links to.

        The steps are: create an unpopulated book, fill in its metadata from
        ``mi`` (adding defaults for language, creator and title plus a fresh
        UUID identifier), add every non-binary file returned by
        ``get_filelist`` to the manifest and spine, rewrite links in the HTML
        and CSS items through ``self.resource_adder``, and finally generate a
        table of contents with one entry per linear spine item.

        TOC labels are the documents' ``<title>`` texts, unless two documents
        share a title, in which case the first heading (``h1``-``h5``, then
        ``strong``) of each document is used instead. A document without a
        title is always labelled by its heading.

        :param htmlpath: Path of the root HTML file, passed to
            ``get_filelist``.
        :param basedir: Base directory, passed to ``get_filelist``.
        :param opts: Conversion options; ``opts.input_encoding`` is read.
        :param log: Logger used for progress messages and containers.
        :param mi: Metadata copied into the book's metadata.
        :return: The populated book object.
        """
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (
            DirContainer,
            rewrite_links,
            urlnormalize,
            urldefrag,
            BINARY_MIME,
            OEB_STYLES,
            xpath,
        )
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        import cssutils, logging

        cssutils.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            oeb.logger.warn("Language not specified")
            metadata.add("language", get_lang().replace("_", "-"))
        if not metadata.creator:
            oeb.logger.warn("Creator not specified")
            metadata.add("creator", self.oeb.translate(__("Unknown")))
        if not metadata.title:
            oeb.logger.warn("Title not specified")
            metadata.add("title", self.oeb.translate(__("Unknown")))
        bookid = str(uuid.uuid4())
        metadata.add("identifier", bookid, id="uuid_id", scheme="uuid")
        # NOTE(review): the loop looks for an identifier carrying an "id"
        # attribute but then stores identifier[0] rather than the one found.
        # Left unchanged because callers may rely on it - confirm intent.
        for ident in metadata.identifier:
            if "id" in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        # Maps the on-disk path of each HTML file to its manifest href
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            # Point the container at the file's directory before adding it
            oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(id="html", href=ascii_filename(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, "text/html")
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log("Normalizing filename cases")
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Stash the helpers on the instance for use outside this method
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log("Rewriting HTML links")
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            item = oeb.manifest.hrefs[htmlfile_map[path]]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                # Find the directory the stylesheet was added from, so that
                # the URLs inside it are resolved relative to it.
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath))

        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        # Keep title and heading together with the item they belong to.
        # Pairing separate lists with the spine by position mislabels every
        # entry after a document without a <title> (or a non-linear item).
        entries = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = "".join(xpath(html, "/h:html/h:head/h:title/text()"))
            title = re.sub(r"\s+", " ", title.strip())
            header = "(unlabled)"
            for tag in ("h1", "h2", "h3", "h4", "h5", "strong"):
                expr = "/h:html/h:body//h:%s[position()=1]/text()"
                candidate = "".join(xpath(html, expr % tag))
                candidate = re.sub(r"\s+", " ", candidate.strip())
                if candidate:
                    header = candidate
                    break
            entries.append((item, title, header))
        # Titles are only useful as labels when they tell documents apart
        titles = [title for _, title, _ in entries if title]
        use_headers = len(titles) > len(set(titles))
        for item, title, header in entries:
            label = header if (use_headers or not title) else title
            toc.add(label, item.href)

        # Leave the container pointing at the current working directory
        oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True)
        return oeb
Exemple #32
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        """Collect the CSS that applies to ``tree`` and compute element styles.

        Stylesheets are gathered in this order: the built-in sheet returned
        by ``html_css_stylesheet()``, ``base_css`` (if given), every
        ``style``/``link`` element found anywhere in the document whose
        ``media`` attribute passes ``media_ok`` (including manifest
        stylesheets pulled in through ``@import``), and finally
        ``extra_css`` / ``user_css``. Every rule is flattened with
        ``self.flatten_rule``, sorted, and applied to the elements its
        selector matches. Afterwards inline ``style`` attributes are applied
        and ``width``/``height`` attributes of images are converted to CSS.

        Note that ``tree`` may be modified: for mobi/docx output a
        ``<span data-fake-first-letter="1">`` is inserted for
        ``:first-letter`` rules, and ``width``/``height`` attributes are
        removed from unstyled ``<img>`` elements.

        :param tree: Parsed XHTML document to style.
        :param path: Key of the document in ``oeb.manifest.hrefs``.
        :param oeb: Book object; its manifest, logger and CSS preprocessor
            are used.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Output profile. When ``None`` the profile whose
            ``short_name`` is ``'default'`` is used, falling back to
            ``opts.output_profile``.
        :param extra_css: Additional CSS text, parsed after the document's
            own stylesheets. Ignored (with a logged error) if unparsable.
        :param user_css: User supplied CSS text, handled like ``extra_css``.
        :param base_css: CSS text placed directly after the built-in sheet.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Name given to stylesheets parsed from inline/extra CSS text
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        # Matched by local name, so elements are found wherever they are in
        # the document; the tag checks below still require the XHTML tags.
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
                # The CSS may be split across the element's own text and the
                # text/tail of child nodes; stitch it back together.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
                    'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                    'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
                ):
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except Exception:
                    # Bad extra/user CSS must not abort the conversion, but
                    # KeyboardInterrupt/SystemExit should still propagate.
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        # Running counter over all rules of all sheets, passed to
        # flatten_rule so that source order is available when sorting.
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        # Sheet 0 is the built-in html_css_stylesheet(), i.e. the user agent
        # sheet.
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Rules inside @media blocks rejected by media_ok are
                    # dropped.
                    if media_ok(rule.media.mediaText):
                        for subrule in rule.cssRules:
                            rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                            index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                    index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        # The pattern contains no backslashes, so a plain unicode literal is
        # identical to the Python 2 only ``ur`` form and also parses on
        # Python 3.
        pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in rules:
            # ``text`` is the selector text of the flattened rule
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                    # Fake first-letter
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                # Leading punctuation/separator characters
                                # are kept together with the first letter.
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = x.makeelement('{%s}span' % XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                # Only the first text-bearing node is wrapped
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        # Matches values made up of digits and dots only (no unit)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        # Only one of width/height may be present
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Exemple #33
0
    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer, rewrite_links,
                                             urlnormalize, urldefrag,
                                             BINARY_MIME, OEB_STYLES, xpath,
                                             urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log,
                             None,
                             opts,
                             self,
                             encoding=opts.input_encoding,
                             populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn('Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path),
                                         log,
                                         ignore_opf=True)
            bname = os.path.basename(path)
            id, href = oeb.manifest.generate(id='html',
                                             href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(
                    item.data, partial(self.resource_adder, base=dpath))

        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb
 def _metadata_from_opf(self, opf):
     """Capture the 'original-resolution' meta value, then defer to OEBReader.

     Every ``o2:meta`` element under ``o2:metadata`` in ``opf`` is inspected.
     For each one named 'original-resolution', its ``content`` attribute
     (or '660x800' when that attribute is absent) is stored in
     ``comic_book_exth_values``. If several such elements exist, the last
     one wins. The return value is whatever
     ``OEBReader._metadata_from_opf`` returns.
     """
     key = 'original-resolution'
     for meta in xpath(opf, 'o2:metadata//o2:meta'):
         if meta.attrib.get('name') != key:
             continue
         comic_book_exth_values[key] = meta.attrib.get('content', '660x800')
     return OEBReader._metadata_from_opf(self, opf)
Exemple #35
0
    def __init__(self, tree, path, oeb, opts, profile=None, extra_css="", user_css=""):
        """Collect the CSS that applies to ``tree`` and resolve it per element.

        Stylesheets are gathered from the built-in HTML stylesheet, from every
        XHTML <style> and stylesheet <link> element in the document, and from
        the ``extra_css`` / ``user_css`` strings. Their rules are flattened
        into one sorted list and applied to the matching elements, followed by
        inline ``style`` attributes and <img> width/height attributes.

        :param tree: element tree of the document; queried with xpath and CSS
            selectors and, for the mobi first-letter emulation, modified.
        :param path: href of the document, used as a key into
            ``oeb.manifest.hrefs``.
        :param oeb: book object; ``manifest``, ``logger``, ``log`` and
            ``css_preprocessor`` are read from it.
        :param opts: conversion options; ``opts.output_profile`` is read.
        :param profile: profile to use. When None, the output profile whose
            ``short_name`` is "default" is used, falling back to
            ``opts.output_profile`` if no such profile exists.
        :param extra_css: CSS source text to parse as an additional stylesheet
            (skipped when empty).
        :param user_css: CSS source text, handled the same way as
            ``extra_css``.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles

            for x in output_profiles():
                if x.short_name == "default":
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Inline/extra sheets are parsed under a synthetic href: <document name>.css
        cssname = os.path.splitext(basename)[0] + ".css"
        # The built-in HTML stylesheet always comes first (sheet_index 0 below).
        stylesheets = [html_css_stylesheet()]
        # Every element named "style" or "link", in any namespace, anywhere in the tree.
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile["name"], profile["props"], profile["macros"])

        parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger("calibre.css"))
        # NOTE(review): initialised empty here; presumably filled by flatten_rule - confirm.
        self.font_face_rules = []
        for elem in style_tags:
            if elem.tag == XHTML("style") and elem.get("type", CSS_MIME) in OEB_STYLES:
                # Concatenate the element's own text with the text and tail of
                # each child node to obtain the full CSS source.
                text = elem.text if elem.text else u""
                for x in elem:
                    t = getattr(x, "text", None)
                    if t:
                        text += u"\n\n" + force_unicode(t, u"utf-8")
                    t = getattr(x, "tail", None)
                    if t:
                        text += u"\n\n" + force_unicode(t, u"utf-8")
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=True)
                    # We handle @import rules separately
                    # (the temporary fetcher returns empty content for any
                    # URL; the real fetcher is restored right after parsing)
                    parser.setFetcher(lambda x: ("utf-8", b""))
                    stylesheet = parser.parseString(text, href=cssname, validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces["h"] = XHTML_NS
                    # Resolve each @import through the manifest. Imported
                    # sheets are appended before the importing sheet itself.
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            # Imports restricted to the amzn-mobi medium are skipped.
                            if rule.media.mediaText == "amzn-mobi":
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn("Ignoring missing stylesheet in @import rule:", rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn("CSS @import of non-CSS file %r" % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Drop @page rules from the inline sheet; the tuple() copy
                    # allows removal while walking the rule list.
                    for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                        stylesheet.cssRules.remove(rule)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (
                elem.tag == XHTML("link")
                and elem.get("href")
                and elem.get("rel", "stylesheet").lower() == "stylesheet"
                and elem.get("type", CSS_MIME).lower() in OEB_STYLES
            ):
                href = urlnormalize(elem.attrib["href"])
                # NOTE: this rebinds the `path` argument; `item` and `cssname`
                # were already derived from the original value above.
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn("Stylesheet %r referenced by file %r not in manifest" % (path, item.href))
                    continue
                # Only manifest items whose data exposes cssRules are usable as sheets.
                if not hasattr(sitem.data, "cssRules"):
                    self.logger.warn("Stylesheet %r referenced by file %r is not CSS" % (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        # Caller-supplied CSS strings: each non-empty one is parsed as one more
        # stylesheet. Any error while parsing is logged and that string ignored.
        csses = {"extra_css": extra_css, "user_css": user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname, validate=False)
                    stylesheet.namespaces["h"] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception("Failed to parse %s, ignoring." % w)
                    self.logger.debug("Bad css: ")
                    self.logger.debug(x)
        # Flatten all sheets into one rule list. `index` increases with every
        # rule passed to flatten_rule, in the order the rules are encountered.
        rules = []
        index = 0
        self.stylesheets = set()
        # NOTE(review): initialised empty here; presumably filled by flatten_rule - confirm.
        self.page_rule = {}
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Only @media blocks listing all, screen or amzn-kf8 contribute rules.
                    media = {rule.media.item(i) for i in xrange(rule.media.length)}
                    if not media.intersection({"all", "screen", "amzn-kf8"}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index == 0))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index == 0))
                    index = index + 1
        # Ordering follows the tuples produced by flatten_rule
        # (presumably specificity first - confirm against flatten_rule).
        rules.sort()
        self.rules = rules
        # NOTE(review): presumably the per-element cache behind self.style() - confirm.
        self._styles = {}
        pseudo_pat = re.compile(ur":(first-letter|first-line|link|hover|visited|active|focus|before|after)", re.I)
        for _, _, cssdict, text, _ in rules:
            # Strip the first recognised pseudo-class/element from the selector
            # text so the rest can be compiled and matched against the tree.
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), "")
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == "first-letter" and getattr(self.oeb, "plumber_output_format", "").lower() == u"mobi":
                    # Fake first-letter
                    from lxml.builder import ElementMaker

                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        # Only the first element yielded by elem.iter() that
                        # has text is rewritten (see the break below).
                        for x in elem.iter():
                            if x.text:
                                # Peel off leading characters whose Unicode
                                # category is punctuation (P*) or separator (Z*).
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {"P", "Z"}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                # The styled span holds the leading characters
                                # plus the first remaining one; the rest of the
                                # text becomes the span's tail.
                                special_text = u"".join(punctuation_chars) + (text[0] if text else u"")
                                span = E.span(special_text)
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    # Also reached for first-letter when the output format is not mobi.
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        # Inline style attributes are applied after all stylesheet rules.
        for elem in xpath(tree, "//h:*[@style]"):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        # Values made up only of digits and dots are treated as unit-less numbers.
        num_pat = re.compile(r"[0-9.]+$")
        for elem in xpath(tree, "//h:img[@width or @height]"):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get("width", "auto") != "auto" or style._style.get("height", "auto") != "auto"
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ("width", "height"):
                    val = elem.get(prop, "").strip()
                    # The attribute is removed from the element; it may be
                    # absent because the xpath matches width OR height.
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += "px"
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Exemple #36
0
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css=''):
        """Collect the CSS that applies to ``tree`` and resolve it per element.

        Stylesheets are gathered from the built-in HTML stylesheet, from the
        XHTML <style> and stylesheet <link> children of ``/html/head``, and
        from the ``extra_css`` / ``user_css`` strings. Their rules are
        flattened into one sorted list and applied to the matching elements,
        followed by inline ``style`` attributes and <img> width/height
        attributes.

        :param tree: element tree of the document; queried with xpath and CSS
            selectors and, for the mobi first-letter emulation, modified.
        :param path: href of the document, used as a key into
            ``oeb.manifest.hrefs``.
        :param oeb: book object; ``manifest``, ``logger``, ``log`` and
            ``css_preprocessor`` are read from it.
        :param opts: conversion options; ``opts.output_profile`` is read.
        :param profile: profile to use; defaults to ``opts.output_profile``
            when None.
        :param extra_css: CSS source text to parse as an additional stylesheet
            (skipped when empty).
        :param user_css: CSS source text, handled the same way as
            ``extra_css``.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Inline/extra sheets are parsed under a synthetic href: <document name>.css
        cssname = os.path.splitext(basename)[0] + '.css'
        # The built-in HTML stylesheet always comes first.
        stylesheets = [html_css_stylesheet()]
        # Only direct children of the first <head> are scanned; with no <head>
        # the empty list makes the loop below a no-op.
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            head = []

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'], profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        # NOTE(review): initialised empty here; presumably filled by flatten_rule - confirm.
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES):
                # Concatenate the element's own text with the text and tail of
                # each child node to obtain the full CSS source.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=True)
                    # We handle @import rules separately
                    # (the temporary fetcher returns empty content for any
                    # URL; the real fetcher is restored right after parsing)
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces['h'] = XHTML_NS
                    # Resolve each @import through the manifest. Imported
                    # sheets are appended before the importing sheet itself.
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            # Imports restricted to the amzn-mobi medium are skipped.
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn(
                                    'Ignoring missing stylesheet in @import rule:',
                                    rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn(
                                    'CSS @import of non-CSS file %r' %
                                    rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                # NOTE: this rebinds the `path` argument; `item` and `cssname`
                # were already derived from the original value above.
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                # Only manifest items whose data exposes cssRules are usable as sheets.
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        # Caller-supplied CSS strings: each non-empty one is parsed as one more
        # stylesheet. Any error while parsing is logged and that string ignored.
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        # Flatten all sheets into one rule list. `index` increases with every
        # rule passed to flatten_rule, in the order the rules are encountered.
        rules = []
        index = 0
        self.stylesheets = set()
        # NOTE(review): initialised empty here; presumably filled by flatten_rule - confirm.
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Only @media blocks listing all, screen or amzn-kf8 contribute rules.
                    media = {
                        rule.media.item(i)
                        for i in xrange(rule.media.length)
                    }
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index))
                    index = index + 1
        # Ordering follows the tuples produced by flatten_rule
        # (presumably specificity first - confirm against flatten_rule).
        rules.sort()
        self.rules = rules
        # NOTE(review): presumably the per-element cache behind self.style() - confirm.
        self._styles = {}
        pseudo_pat = re.compile(
            ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)',
            re.I)
        for _, _, cssdict, text, _ in rules:
            # Strip the first recognised pseudo-class/element from the selector
            # text so the rest can be compiled and matched against the tree.
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), '')
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                                                    'plumber_output_format',
                                                    '').lower() == u'mobi':
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        # Only the first element yielded by elem.iter() that
                        # has text is rewritten (see the break below).
                        for x in elem.iter():
                            if x.text:
                                # Peel off leading characters whose Unicode
                                # category is punctuation (P*) or separator (Z*).
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                # The styled span holds the leading characters
                                # plus the first remaining one; the rest of the
                                # text becomes the span's tail.
                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = E.span(special_text)
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    # Also reached for first-letter when the output format is not mobi.
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        # Inline style attributes are applied after all stylesheet rules.
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        # Values made up only of digits are treated as unit-less numbers.
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    # The attribute is removed from the element; it may be
                    # absent because the xpath matches width OR height.
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Exemple #37
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css=''):
        """Collect the CSS that applies to ``tree`` and resolve it per element.

        Stylesheets are gathered from the built-in HTML stylesheet, from the
        XHTML <style> and stylesheet <link> children of ``/html/head``, and
        from the ``extra_css`` / ``user_css`` strings. Their rules are
        flattened into one sorted list and applied to the matching elements,
        followed by inline ``style`` attributes and <img> width/height
        attributes.

        :param tree: element tree of the document; queried with xpath and CSS
            selectors and, for the mobi first-letter emulation, modified.
        :param path: href of the document, used as a key into
            ``oeb.manifest.hrefs``.
        :param oeb: book object; ``manifest``, ``logger``, ``log`` and
            ``css_preprocessor`` are read from it.
        :param opts: conversion options; ``opts.output_profile`` is read.
        :param profile: profile to use; defaults to ``opts.output_profile``
            when None.
        :param extra_css: CSS source text to parse as an additional stylesheet
            (skipped when empty).
        :param user_css: CSS source text, handled the same way as
            ``extra_css``.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Inline/extra sheets are parsed under a synthetic href: <document name>.css
        cssname = os.path.splitext(basename)[0] + '.css'
        # The built-in HTML stylesheet always comes first.
        stylesheets = [html_css_stylesheet()]
        # Only direct children of the first <head> are scanned; with no <head>
        # the empty list makes the loop below a no-op.
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            head = []

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        # NOTE(review): initialised empty here; presumably filled by flatten_rule - confirm.
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                # Concatenate the element's own text with the text and tail of
                # each child node to obtain the full CSS source.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=True)
                    # We handle @import rules separately
                    # (the temporary fetcher returns empty content for any
                    # URL; the real fetcher is restored right after parsing)
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces['h'] = XHTML_NS
                    # Resolve each @import through the manifest. Imported
                    # sheets are appended before the importing sheet itself.
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            # Imports restricted to the amzn-mobi medium are skipped.
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                # NOTE: this rebinds the `path` argument; `item` and `cssname`
                # were already derived from the original value above.
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                # Only manifest items whose data exposes cssRules are usable as sheets.
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        # Caller-supplied CSS strings: each non-empty one is parsed as one more
        # stylesheet. Any error while parsing is logged and that string ignored.
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        # Flatten all sheets into one rule list. `index` increases with every
        # rule passed to flatten_rule, in the order the rules are encountered.
        rules = []
        index = 0
        self.stylesheets = set()
        # NOTE(review): initialised empty here; presumably filled by flatten_rule - confirm.
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Only @media blocks listing all, screen or amzn-kf8 contribute rules.
                    media = {rule.media.item(i) for i in
                             xrange(rule.media.length)}
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index))
                    index = index + 1
        # Ordering follows the tuples produced by flatten_rule
        # (presumably specificity first - confirm against flatten_rule).
        rules.sort()
        self.rules = rules
        # NOTE(review): presumably the per-element cache behind self.style() - confirm.
        self._styles = {}
        pseudo_pat = re.compile(ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I)
        for _, _, cssdict, text, _ in rules:
            # Strip the first recognised pseudo-class/element from the selector
            # text so the rest can be compiled and matched against the tree.
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), '')
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() == u'mobi':
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        # Only the first element yielded by elem.iter() that
                        # has text is rewritten (see the break below).
                        for x in elem.iter():
                            if x.text:
                                # Peel off leading characters whose Unicode
                                # category is punctuation (P*) or separator (Z*).
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                # The styled span holds the leading characters
                                # plus the first remaining one; the rest of the
                                # text becomes the span's tail.
                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = E.span(special_text)
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    # Also reached for first-letter when the output format is not mobi.
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        # Inline style attributes are applied after all stylesheet rules.
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        # Values made up only of digits are treated as unit-less numbers.
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    # The attribute is removed from the element; it may be
                    # absent because the xpath matches width OR height.
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Exemple #38
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css=''):
        """Gather all CSS that applies to one document and resolve it onto
        the elements of ``tree``.

        The stylesheets are collected in this order: the built-in HTML
        stylesheet, every ``<style>`` element and linked stylesheet found in
        the document ``<head>`` (in document order), then ``extra_css`` and
        ``user_css``.  Their rules are flattened, sorted, and applied to the
        elements matched by each selector.  Afterwards inline ``style``
        attributes are applied and ``width``/``height`` attributes on
        ``<img>`` elements are converted into CSS properties.

        :param tree: Parsed document; queried with XPath and CSS selectors.
            It is modified in place: ``:first-letter`` rules insert a
            ``<span>`` and ``<img>`` ``width``/``height`` attributes may be
            removed.
        :param path: Key of this document in ``oeb.manifest.hrefs``.
        :param oeb: Book container; ``logger``, ``manifest`` and
            ``css_preprocessor`` are used here.
        :param opts: Conversion options; ``output_profile`` is used here.
        :param profile: Output profile to store on the instance.  When
            ``None`` the profile whose ``short_name`` is ``'default'`` is
            used, falling back to ``opts.output_profile``.
        :param extra_css: Additional CSS text; skipped when empty.
        :param user_css: Additional CSS text; skipped when empty.
        :raises KeyError: If ``path`` is not in ``oeb.manifest.hrefs``.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Stylesheets parsed from text in this method are all given an href
        # derived from the document's own file name.
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            # No <head>: iterate over nothing below.
            head = []

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                # Concatenate the element's own text with the text and tail
                # of every child node to recover the full CSS source.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = XHTML_CSS_NAMESPACE + text
                    text = oeb.css_preprocessor(text)
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                # Kept in its own local so the ``path`` parameter is not
                # rebound part-way through the method.
                sheet_path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(sheet_path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (sheet_path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(
                        sheet_path, item.href))
                    continue
                stylesheets.append(sitem.data)
        # NOTE(review): the relative order of extra_css and user_css follows
        # dict iteration order -- confirm this is the intended cascade order.
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except Exception:
                    # Best effort: broken extra CSS must not abort styling,
                    # but KeyboardInterrupt/SystemExit should still propagate.
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        # Running counter over every rule of every stylesheet, passed to
        # flatten_rule() together with the rule and its stylesheet href.
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                rules.extend(self.flatten_rule(rule, href, index))
                index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        # Each flattened rule is a 5-tuple; only the property dict (third
        # item) and the selector text (fourth item) are needed here.
        for _, _, cssdict, text, _ in rules:
            fl = ':first-letter' in text
            if fl:
                text = text.replace(':first-letter', '')
            selector = get_css_selector(text)
            matches = selector(tree, self.logger)
            if fl:
                # Fake :first-letter by wrapping the leading punctuation plus
                # the first following character in a <span> and styling it.
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if callable(x.tag):
                            # Comments and processing instructions have a
                            # factory function as their tag. They carry text
                            # but cannot hold children, so inserting the
                            # <span> into them would raise TypeError.
                            continue
                        if x.text:
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                if not unicodedata.category(text[0]).startswith('P'):
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (text[0] if text else u'')
                            span = E.span(special_text)
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            # Only the first text-bearing node is wrapped.
                            break
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        # The XPath matches when either attribute is present,
                        # so the other one may legitimately be missing.
                        pass
                    if val:
                        # A bare integer is treated as a pixel length.
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)