Python urlnormalize Examples, ebook_converter.ebooks.oeb.base.urlnormalize Python Examples

Example #1

0

Show file

 def serialize_elem(self, elem, item, nsrmap=NSRMAP):
     """
     Write ``elem`` and, recursively, its children to ``self.buf``.

     Elements whose tag is not a string, or whose namespace is missing from
     ``nsrmap``, produce no output at all. While serializing, the buffer
     offset of an ``id`` attribute is recorded in ``self.id_offsets`` (keyed
     by the normalized ``item.href#id``), ``href`` attributes are handed to
     ``self.serialize_href`` and ``src`` attributes that resolve to a known
     image are replaced by a ``recindex`` attribute.

     :param elem: element to serialize; its ``id`` attribute is removed.
     :param item: object providing ``href`` and ``abshref()`` for the
         document that contains ``elem``.
     :param nsrmap: mapping used to filter namespaces and to build prefixed
         tag/attribute names.
     """
     buf = self.buf
     if not isinstance(elem.tag, (str, bytes)) \
         or parse_utils.namespace(elem.tag) not in nsrmap:
         return
     tag = base.prefixname(elem.tag, nsrmap)
     # Previous layers take care of @name
     id_ = elem.attrib.pop('id', None)
     if id_:
         href = '#'.join((item.href, id_))
         # Prefer the offset remembered in self.anchor_offset; fall back to
         # the current write position when it is unset.
         offset = self.anchor_offset or buf.tell()
         key = base.urlnormalize(href)
         # Only set this id_offset if it wasn't previously seen
         self.id_offsets[key] = self.id_offsets.get(key, offset)
     # With the id already popped, an <a> that has no attributes, children
     # or text is dropped while an anchor offset is being tracked.
     if self.anchor_offset is not None and \
         tag == 'a' and not elem.attrib and \
         not len(elem) and not elem.text:
         return
     self.anchor_offset = buf.tell()
     buf.write(b'<')
     buf.write(tag.encode('utf-8'))
     if elem.attrib:
         for attr, val in elem.attrib.items():
             if parse_utils.namespace(attr) not in nsrmap:
                 continue
             attr = base.prefixname(attr, nsrmap)
             buf.write(b' ')
             if attr == 'href':
                 # A true result means serialize_href already wrote its own
                 # replacement for this attribute.
                 if self.serialize_href(val, item):
                     continue
             elif attr == 'src':
                 href = base.urlnormalize(item.abshref(val))
                 if href in self.images:
                     # Known image: emit its index instead of the src.
                     index = self.images[href]
                     self.used_images.add(href)
                     buf.write(b'recindex="%05d"' % index)
                     continue
             buf.write(attr.encode('utf-8'))
             buf.write(b'="')
             self.serialize_text(val, quot=True)
             buf.write(b'"')
     buf.write(b'>')
     if elem.text or len(elem) > 0:
         if elem.text:
             # Text has been emitted, so the remembered offset is cleared.
             self.anchor_offset = None
             self.serialize_text(elem.text)
         for child in elem:
             # NOTE(review): the recursive call does not forward ``nsrmap``,
             # so children always use the default mapping - confirm this is
             # intended.
             self.serialize_elem(child, item)
             if child.tail:
                 self.anchor_offset = None
                 self.serialize_text(child.tail)
     buf.write(('</%s>' % tag).encode('utf-8'))

Example #2

0

Show file

File: trimmanifest.py Project: keshavbhatt/ebook-converter

 def __call__(self, oeb, context):
     """
     Remove from the manifest every item that nothing else refers to.

     Items referenced by the metadata, the guide or the spine form the
     starting set. Links found in (X)HTML/XML documents and in CSS files
     are then followed until no new manifest items turn up. Whatever was
     never reached is removed from ``oeb.manifest``.

     :param oeb: the book being processed; its ``metadata``, ``manifest``,
         ``guide``, ``spine`` and ``logger`` are used.
     :param context: conversion options, stored on ``self.opts``.
     """
     import css_parser
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urllib.parse.urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # Everything in the spine is always kept
     for item in oeb.spine:
         used.add(item)
     # ``used`` is not modified while a pass iterates ``unchecked``; newly
     # found items are collected in ``new`` and merged after the pass.
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except Exception:
                         # Best effort: skip links that cannot be resolved,
                         # but let KeyboardInterrupt/SystemExit propagate.
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in css_parser.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     # Iterate over a snapshot: items are removed from the manifest inside
     # the loop.
     for item in list(oeb.manifest.values()):
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)

Example #3

0

Show file

 def _toc_from_html(self, opf):
     if 'toc' not in self.oeb.guide:
         return False
     self.log.debug('Reading TOC from HTML...')
     itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
     item = self.oeb.manifest.hrefs[itempath]
     html = item.data
     if frag:
         elems = base.xpath(html, './/*[@id="%s"]' % frag)
         if not elems:
             elems = base.xpath(html, './/*[@name="%s"]' % frag)
         elem = elems[0] if elems else html
         while elem != html and not base.xpath(elem, './/h:a[@href]'):
             elem = elem.getparent()
         html = elem
     titles = collections.defaultdict(list)
     order = []
     for anchor in base.xpath(html, './/h:a[@href]'):
         href = anchor.attrib['href']
         href = item.abshref(base.urlnormalize(href))
         path, frag = urllib.parse.urldefrag(href)
         if path not in self.oeb.manifest.hrefs:
             continue
         title = base.xml2text(anchor)
         title = base.COLLAPSE_RE.sub(' ', title.strip())
         if href not in titles:
             order.append(href)
         titles[href].append(title)
     toc = self.oeb.toc
     for href in order:
         toc.add(' '.join(titles[href]), href)
     return True

Example #4

0

Show file

File: pdf_output.py Project: keshavbhatt/ebook-converter

    def process_fonts(self):
        """
        Make sure all fonts are embeddable.

        For every ``@font-face`` rule found in the manifest's style sheets,
        the referenced font is looked up in the manifest and passed through
        ``remove_embed_restriction``. When that changes the font data, the
        new bytes are stored on the manifest item and written to the
        container. Rules or fonts that cannot be processed are skipped.
        """
        from ebook_converter.ebooks.oeb.base import urlnormalize
        from ebook_converter.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                try:
                    s = rule.style
                    src = s.getProperty('src').propertyValue[0].uri
                except Exception:
                    # Rule without a usable src; best effort, but do not
                    # swallow KeyboardInterrupt/SystemExit.
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue

                raw = nraw = ff.data
                # Each font path is handled at most once.
                if path not in processed:
                    processed.add(path)
                    try:
                        nraw = remove_embed_restriction(raw)
                    except Exception:
                        continue
                    if nraw != raw:
                        ff.data = nraw
                        self.oeb.container.write(path, nraw)

Example #5

0

Show file

File: filenames.py Project: gryf/ebook-converter

    def __call__(self, oeb, opts):
        """
        Rewrite every link in the book according to ``self.rename_map``.

        Links inside parsed documents and style sheets are rewritten via
        ``self.url_replacer``; guide references and TOC entries are then
        updated to the renamed targets, keeping their fragments.
        """
        import css_parser
        self.log, self.opts, self.oeb = oeb.logger, opts, oeb

        for manifest_item in oeb.manifest.items:
            self.current_item = manifest_item
            if etree.iselement(manifest_item.data):
                rewrite_links(self.current_item.data, self.url_replacer)
            elif hasattr(manifest_item.data, 'cssText'):
                css_parser.replaceUrls(manifest_item.data, self.url_replacer)

        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                path, frag = urllib.parse.urldefrag(urlnormalize(ref.href))
                renamed = self.rename_map.get(path, None)
                if renamed is None:
                    continue
                # Re-attach the fragment, if there was one.
                ref.href = (renamed + '#' + frag) if frag else renamed

        if self.oeb.toc:
            self.fix_toc_entry(self.oeb.toc)

Example #6

0

Show file

 def serialize_href(self, href, _base=None):
     """
     Serialize the href attribute of an <a> or <reference> tag.

     The attribute is written as ``filepos=`` followed by a zero-filled
     placeholder; the placeholder's position is remembered in
     self.href_offsets so the real value can be filled in at the end.
     Returns True when something was written, False when the link could
     not be resolved to an item in the spine.
     """
     manifest_hrefs = self.oeb.manifest.hrefs
     try:
         path, frag = urllib.parse.urldefrag(base.urlnormalize(href))
     except ValueError:
         # Unparseable URL
         return False
     if path and _base:
         path = _base.abshref(path)
     target = None
     if path:
         if path not in manifest_hrefs:
             return False
         target = manifest_hrefs[path]
     out = self.buf
     if target and target.spine_position is None:
         return False
     # A fragment-only link refers to the document it appears in.
     owner = target.href if target else _base.href
     key = '#'.join((owner, frag)) if frag else owner
     out.write(b'filepos=')
     self.href_offsets[key].append(out.tell())
     out.write(b'0000000000')
     return True

Example #7

0

Show file

def find_font_face_rules(sheet, oeb):
    '''
    Collect the properties of every usable @font-face rule in ``sheet``.

    ``sheet`` is either an object exposing ``data.cssRules`` (and
    ``abshref``) or one exposing ``cssRules`` directly. Rules without a
    font-family or src, or whose font is not in the manifest, are ignored.
    Returns a list of property dicts, one per accepted rule.
    '''
    try:
        rules = sheet.data.cssRules
    except AttributeError:
        rules = sheet.cssRules

    found = []
    for rule in rules:
        if rule.type != rule.FONT_FACE_RULE:
            continue
        props = get_font_properties(rule, default='normal')
        if not (props['font-family'] and props['src']):
            continue

        # Only sheets that know their own location can resolve the src.
        try:
            path = sheet.abshref(props['src'])
        except AttributeError:
            path = props['src']
        font_item = oeb.manifest.hrefs.get(urlnormalize(path), None)
        if not font_item:
            continue
        props['item'] = font_item
        # Relative weights are treated as the regular weight.
        if props['font-weight'] in {'bolder', 'lighter'}:
            props['font-weight'] = '400'
        props['weight'] = int(props['font-weight'])
        props['rule'] = rule
        props['chars'] = set()
        found.append(props)

    return found

Example #8

0

Show file

File: oeb2html.py Project: keshavbhatt/ebook-converter

 def rewrite_link(self, url, page=None):
     """
     Map ``url`` to its location in the generated output.

     Known images resolve to ``images/<name>``, known links to their entry
     in ``self.links``. Without a ``page``, or for unknown targets, ``url``
     is returned unchanged.
     """
     if page:
         resolved = page.abshref(base.urlnormalize(url))
         if resolved in self.images:
             return 'images/%s' % self.images[resolved]
         if resolved in self.links:
             return self.links[resolved]
     return url

Example #9

0

Show file

File: rasterize.py Project: gryf/ebook-converter

 def rasterize_item(self, item):
     """
     Rasterize every SVG used by ``item``.

     ``<img>`` and ``<object>`` elements that point at an SVG manifest item
     are handled by ``rasterize_external``; inline ``<svg>`` elements are
     handled by ``rasterize_inline``.
     """
     root = item.data
     manifest_hrefs = self.oeb.manifest.hrefs
     # (xpath expression, attribute holding the reference)
     linked_svg_queries = (
         ('//h:img[@src]', 'src'),
         ('//h:object[@type="%s" and @data]' % SVG_MIME, 'data'),
     )
     for query, attr in linked_svg_queries:
         for node in xpath(root, query):
             target = urlnormalize(node.attrib[attr])
             image = manifest_hrefs.get(item.abshref(target), None)
             if image and image.media_type == SVG_MIME:
                 node_style = self.stylizer(item).style(node)
                 self.rasterize_external(node, node_style, item, image)
     for node in xpath(root, '//svg:svg'):
         node_style = self.stylizer(item).style(node)
         self.rasterize_inline(node, node_style, item)

Example #10

0

Show file

 def inspect_cover(self, href):
     """
     Return the trailing values reported by ``identify`` for the cover.

     The manifest item whose href equals the normalized ``href`` is looked
     up and its data handed to ``identify``. ``(-1, -1)`` is returned when
     no item matches or reading it fails (the failure is logged).
     """
     from ebook_converter.ebooks.oeb.base import urlnormalize
     for entry in self.oeb.manifest:
         if entry.href != urlnormalize(href):
             continue
         try:
             return identify(entry.data)[1:]
         except Exception:
             self.log.exception('Failed to read cover image dimensions')
     return (-1, -1)

Example #11

0

Show file

File: htmltoc.py Project: gryf/ebook-converter

    def __call__(self, oeb, context):
        """
        Make sure the book has an in-line (HTML) table of contents.

        When the guide already points to a manifest document that contains
        links, that document is put into the spine if necessary and nothing
        else happens. Otherwise, provided the book has TOC nodes, a new
        ``contents.xhtml`` with its style sheet is generated, added to the
        spine and registered in the guide as 'toc'.
        """
        def place_in_spine(entry):
            # 'end' appends as non-linear; any other position goes first.
            if self.position == 'end':
                oeb.spine.add(entry, linear=False)
            else:
                oeb.spine.insert(0, entry, linear=True)

        has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

        if 'toc' in oeb.guide:
            # Ensure toc pointed to in <guide> is in spine
            from ebook_converter.ebooks.oeb.base import urlnormalize
            guide_href = urlnormalize(oeb.guide['toc'].href)
            if guide_href not in oeb.manifest.hrefs:
                oeb.guide.remove('toc')
            else:
                guide_item = oeb.manifest.hrefs[guide_href]
                if (hasattr(guide_item.data, 'xpath') and
                        base.XPath('//h:a[@href]')(guide_item.data)):
                    if oeb.spine.index(guide_item) < 0:
                        place_in_spine(guide_item)
                    return
                if has_toc:
                    oeb.guide.remove('toc')
        if not has_toc:
            return
        oeb.logger.info('Generating in-line TOC...')
        heading = self.title or oeb.translate(DEFAULT_TITLE)
        style_name = self.style
        if style_name not in STYLE_CSS:
            oeb.logger.error('Unknown TOC style %r', style_name)
            style_name = 'nested'
        css_id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
        oeb.manifest.add(css_id, css_href, base.CSS_MIME,
                         data=STYLE_CSS[style_name])
        language = str(oeb.metadata.language[0])
        # Build the document skeleton: <html><head>...</head><body>...
        document = base.element(None, base.tag('xhtml', 'html'),
                                nsmap={None: const.XHTML_NS},
                                attrib={base.tag('xml', 'lang'): language})
        head = base.element(document, base.tag('xhtml', 'head'))
        title_elem = base.element(head, base.tag('xhtml', 'title'))
        title_elem.text = heading
        base.element(head, base.tag('xhtml', 'link'), rel='stylesheet',
                     type=base.CSS_MIME, href=css_href)
        body = base.element(document, base.tag('xhtml', 'body'),
                            attrib={'class': 'calibre_toc'})
        header = base.element(body, base.tag('xhtml', 'h2'),
                              attrib={'class': 'calibre_toc_header'})
        header.text = heading
        self.add_toc_level(body, oeb.toc)
        contents_id, contents_href = oeb.manifest.generate('contents',
                                                           'contents.xhtml')
        contents_item = oeb.manifest.add(contents_id, contents_href,
                                         base.XHTML_MIME, data=document)
        place_in_spine(contents_item)
        oeb.guide.add('toc', 'Table of Contents', contents_href)

Example #12

0

Show file

File: filenames.py Project: gryf/ebook-converter

    def fix_toc_entry(self, toc):
        """
        Point ``toc`` and all of its descendants at their renamed targets.

        An entry's href is replaced when its path (without fragment) has an
        entry in ``self.rename_map``; the fragment is kept.
        """
        if toc.href:
            path, frag = urllib.parse.urldefrag(urlnormalize(toc.href))
            renamed = self.rename_map.get(path, None)
            if renamed is not None:
                toc.href = '#'.join((renamed, frag)) if frag else renamed

        for child in toc:
            self.fix_toc_entry(child)

Example #13

0

Show file

File: jacket.py Project: keshavbhatt/ebook-converter

 def remove_images(self, item, limit=1):
     """
     Remove up to ``limit`` images referenced by ``item`` from the book.

     Each removed image is dropped from the manifest and the guide, and its
     ``<img>`` element is deleted from the document. Returns the number of
     images removed.
     """
     find_images = XPath('//h:img[@src]')
     count = 0
     for img in find_images(item.data):
         if count >= limit:
             break
         href = item.abshref(img.get('src'))
         target = self.oeb.manifest.hrefs.get(href)
         if target is None:
             # Retry with the normalized form of the href.
             href = urlnormalize(href)
             target = self.oeb.manifest.hrefs.get(href)
         if target is None:
             continue
         self.oeb.manifest.remove(target)
         self.oeb.guide.remove_by_href(href)
         img.getparent().remove(img)
         count += 1
     return count

Example #14

0

Show file

 def _guide_from_opf(self, opf):
     """
     Populate the guide from the ``<reference>`` elements of ``opf``.

     A reference whose path is not in the manifest is matched
     case-insensitively against the manifest hrefs; if that fails too, a
     warning is logged and the reference is skipped. Only the first
     reference of each type is added.
     """
     guide, manifest = self.oeb.guide, self.oeb.manifest
     for ref in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
         target = ref.get('href')
         path = base.urlnormalize(urllib.parse.urldefrag(target)[0])
         if path not in manifest.hrefs:
             # Fall back to a case-insensitive match.
             match = next((known for known in manifest.hrefs
                           if known.lower() == path.lower()), None)
             if match is None:
                 self.logger.warn('Guide reference %r not found' % target)
                 continue
             target = match
         kind = ref.get('type')
         if kind not in guide:
             guide.add(kind, ref.get('title'), target)

Example #15

0

Show file

File: filenames.py Project: gryf/ebook-converter

    def url_replacer(self, orig_url):
        """
        Return the replacement for ``orig_url`` found in the current item.

        URLs with a scheme are returned untouched. Local URLs are resolved
        against the item's original location, mapped through
        ``self.rename_map`` and made relative to the current item again; a
        fragment, if present, is preserved.
        """
        normalized = urlnormalize(orig_url)
        if urllib.parse.urlparse(normalized).scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urllib.parse.urldefrag(normalized)
        current = self.current_item
        source_item = current
        if self.renamed_items_map:
            source_item = self.renamed_items_map.get(current.href, current)

        absolute = source_item.abshref(path)
        rewritten = current.relhref(self.rename_map.get(absolute, absolute))
        return (rewritten + '#' + frag) if frag else rewritten

Example #16

0

Show file

File: rasterize.py Project: gryf/ebook-converter

 def dataize_svg(self, item, svg=None):
     """
     Replace xlink references to manifest items with temporary files.

     For every element of ``svg`` (default: ``item.data``) whose xlink href
     resolves to a manifest item, the item's bytes are written to a
     persistent temporary file and the href is pointed at that file. The
     file names are recorded in ``self.temp_files``. Returns ``svg``.
     """
     if svg is None:
         svg = item.data
     manifest_hrefs = self.oeb.manifest.hrefs
     for node in xpath(svg, '//svg:*[@xl:href]'):
         link_attr = base.tag('xlink', 'href')
         path = urllib.parse.urldefrag(urlnormalize(node.attrib[link_attr]))[0]
         if not path:
             continue
         resolved = item.abshref(path)
         if resolved not in manifest_hrefs:
             continue
         payload = manifest_hrefs[resolved].bytes_representation
         # Fall back to 'jpg' when the image type is not recognised.
         suffix = '.' + (what(None, payload) or 'jpg')
         with PersistentTemporaryFile(suffix=suffix) as tmp:
             tmp.write(payload)
             self.temp_files.append(tmp.name)
         node.attrib[link_attr] = tmp.name
     return svg

Example #17

0

Show file

 def _pages_from_page_map(self, opf):
     item = self._find_page_map(opf)
     if item is None:
         return False
     pmap = item.data
     pages = self.oeb.pages
     for page in base.xpath(pmap, 'o2:page'):
         name = page.get('name', '')
         href = page.get('href')
         if not href:
             continue
         name = base.COLLAPSE_RE.sub(' ', name.strip())
         href = item.abshref(base.urlnormalize(href))
         type = 'normal'
         if not name:
             type = 'special'
         elif name.lower().strip('ivxlcdm') == '':
             type = 'front'
         pages.add(name, href, type=type)
     return True

Example #18

0

Show file

 def _toc_from_tour(self, opf):
     """
     Build the TOC from the first ``<tour>`` element of ``opf``.

     Each site with both a title and an href that points into the manifest
     becomes a TOC entry; unresolvable sites are logged and skipped.
     Returns ``False`` when there is no tour, ``True`` otherwise.
     """
     tours = base.xpath(opf, 'o2:tours/o2:tour')
     if not tours:
         return False
     self.log.debug('Reading TOC from tour...')
     first_tour = tours[0]
     toc = self.oeb.toc
     toc.title = first_tour.get('title')
     for site in base.xpath(first_tour, 'o2:site'):
         title, href = site.get('title'), site.get('href')
         if not (title and href):
             continue
         path = urllib.parse.urldefrag(base.urlnormalize(href))[0]
         if path not in self.oeb.manifest.hrefs:
             self.logger.warn('TOC reference %r not found' % href)
             continue
         toc.add(title, href, id=site.get('id'))
     return True

Example #19

0

Show file

    def rewrite_links(self, url):
        """
        Return the new location of ``url`` according to ``self.map``.

        ``url`` is returned unchanged when it cannot be parsed or normalized,
        or when its target has no entry in ``self.map``.
        """
        path, frag = urllib.parse.urldefrag(url)
        try:
            absolute = self.current_item.abshref(path)
            normalized = base.urlnormalize(absolute)
        except ValueError:
            # Unparseable URL, or href has non utf-8 quoting
            return url
        if normalized not in self.map:
            return url
        anchor_map = self.map[normalized]
        # Links without a fragment are stored under the None key.
        target = self.current_item.relhref(anchor_map[frag or None])
        if frag:
            target = '#'.join((polyglot.unquote(target), frag))
        return target

Example #20

0

Show file

 def _pages_from_ncx(self, opf, item):
     if item is None:
         return False
     ncx = item.data
     if ncx is None:
         return False
     ptargets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
     if not ptargets:
         return False
     pages = self.oeb.pages
     for ptarget in ptargets:
         name = ''.join(base.xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
         name = base.COLLAPSE_RE.sub(' ', name.strip())
         href = base.xpath(ptarget, 'ncx:content/@src')
         if not href:
             continue
         href = item.abshref(base.urlnormalize(href[0]))
         id = ptarget.get('id')
         type = ptarget.get('type', 'normal')
         klass = ptarget.get('class')
         pages.add(name, href, type=type, id=id, klass=klass)
     return True

Example #21

0

Show file

 def _spine_add_extra(self):
     """
     Add documents that are linked from the spine, but not in it, to the
     spine as non-linear items.

     Links are followed transitively, starting from the spine items. For
     books whose version is 2 or higher a warning is logged for every
     document added this way.
     """
     manifest, spine = self.oeb.manifest, self.oeb.spine
     pending = set(spine)
     link_hrefs = base.XPath('h:body//h:a/@href')
     extras = set()
     while pending:
         discovered = set()
         for doc in pending:
             if doc.media_type not in base.OEB_DOCS:
                 # TODO: handle fallback chains
                 continue
             for raw_href in link_hrefs(doc.data):
                 target = urllib.parse.urldefrag(raw_href)[0]
                 if not target:
                     continue
                 try:
                     target = doc.abshref(base.urlnormalize(target))
                 except ValueError:  # Malformed URL
                     continue
                 if target not in manifest.hrefs:
                     continue
                 linked = manifest.hrefs[target]
                 # Keep only documents not already accounted for.
                 if (linked.media_type in base.OEB_DOCS and
                         linked not in spine and linked not in extras):
                     discovered.add(linked)
         extras.update(discovered)
         pending = discovered
     version = int(self.oeb.version[0])
     ignored = getattr(self.oeb, 'removed_items_to_ignore', ())
     for extra in sorted(extras):
         if extra.href in ignored:
             continue
         if version >= 2:
             self.logger.warn('Spine-referenced file %r not in spine' %
                              extra.href)
         spine.add(extra, linear=False)

Example #22

0

Show file

        def serialize_toc_level(tocref, href=None):
            """
            Add the provided toc level to the output stream.

            Writes a heading (omitted for nodes whose klass is "periodical")
            followed by a list with one ``filepos`` link per child node of
            ``tocref``. Each link is written with a zero placeholder whose
            position is recorded in ``self.href_offsets``.

            :param tocref: toc node providing ``klass``, ``title`` and
                ``nodes``.
            :param href: if provided, the position of this toc level is
                registered in ``self.id_offsets`` under the normalized href
                (e.g. feed_0/index.html).
            """
            if href is not None:
                # resolve the section url in id_offsets
                buf.write(b'<mbp:pagebreak />')
                self.id_offsets[base.urlnormalize(href)] = buf.tell()

            if tocref.klass == "periodical":
                buf.write(b'<div> <div height="1em"></div>')
            else:
                t = tocref.title
                # The buffer takes bytes, so text titles are encoded first.
                if isinstance(t, str):
                    t = t.encode('utf-8')
                buf.write(
                    b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' +
                    t + b'</b></font></h2> <div height="1em"></div>')

            buf.write(b'<ul>')

            for tocitem in tocref.nodes:
                buf.write(b'<li><a filepos=')
                itemhref = tocitem.href
                if tocref.klass == 'periodical':
                    # This is a section node.
                    # For periodical tocs, the section urls are like
                    # r'feed_\d+/index.html'. We don't want to point to the
                    # start of the first article, so we change the href.
                    itemhref = re.sub(r'article_\d+/', '', itemhref)
                # Remember where the placeholder starts.
                self.href_offsets[itemhref].append(buf.tell())
                buf.write(b'0000000000')
                buf.write(b' ><font size="+1"><b><u>')
                t = tocitem.title
                if isinstance(t, str):
                    t = t.encode('utf-8')
                buf.write(t)
                buf.write(b'</u></b></font></a></li>')

            buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

Example #23

0

Show file

 def serialize_item(self, item):
     '''
     Write one spine item to the output buffer.

     The position at which the item starts is stored in self.id_offsets
     under its normalized href. Section/article start and end markers and
     an optional page break are written around the item's body elements.
     '''
     out = self.buf
     if not item.linear:
         self.breaks.append(out.tell() - 1)
     self.id_offsets[base.urlnormalize(item.href)] = out.tell()
     # Opening markers
     if item.is_section_start:
         out.write(b'<a ></a> ')
     if item.is_article_start:
         out.write(b'<a ></a> <a ></a>')
     body = item.data.find(base.tag('xhtml', 'body'))
     for child in body:
         self.serialize_elem(child, item)
     if self.write_page_breaks_after_item:
         out.write(b'<mbp:pagebreak/>')
     # Closing markers
     if item.is_article_end:
         # Kindle periodical article end marker
         out.write(b'<a ></a> <a ></a>')
     if item.is_section_end:
         out.write(b' <a ></a>')
     self.anchor_offset = None

Example #24

0

Show file

File: fb2ml.py Project: keshavbhatt/ebook-converter

    def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
        """
        Convert ``elem_tree`` and all of its descendants to FB2 markup.

        This function is intended to be used in a recursive manner. dump_text
        will run through all elements in the elem_tree and call itself on
        each element.

        self.image_hrefs will be populated by calling this function.

        @param elem_tree: etree representation of XHTML content to be
            transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account. The
            shared default list is only ever read or concatenated into a
            new list in this function, never modified in place.

        @return: List of string representing the XHTML converted to FB2 markup.
        """
        elem = elem_tree

        # Ensure what we are converting is not a string and that the first
        # tag is part of the XHTML namespace. Otherwise only the tail text is
        # kept, and only when the parent is an XHTML element.
        if (not isinstance(elem_tree.tag, (str, bytes))
                or parse_utils.namespace(elem_tree.tag) != const.XHTML_NS):
            p = elem.getparent()
            if (p is not None and isinstance(p.tag, (str, bytes))
                    and parse_utils.namespace(p.tag) == const.XHTML_NS
                    and elem.tail):
                return [elem.tail]
            return []

        # Hidden elements contribute nothing but their tail text.
        style = stylizer.style(elem_tree)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close
        # the tags.
        tags = []
        # First tag in tree
        tag = parse_utils.barename(elem_tree.tag)
        # Number of blank lines above tag, derived from the top margin
        # relative to the font size; 0 when it cannot be computed.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except Exception:
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another
            # section, so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC
                # pointed to this page (then we use the first non-<body> on
                # the page as a <title>), or the TOC pointed to a specific
                # element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if (tag != 'body' and hasattr(elem_tree, 'text')
                                and elem_tree.text):
                            newlevel = 1
                            # The page-level entry is consumed by the first
                            # element that carries text.
                            self.toc[page.href] = None
                    if (not newlevel
                            and elem_tree.attrib.get('id', None) is not None):
                        newlevel = toc_entry.get(
                            elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    # Close sections until the new one fits at its level.
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now
                # to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be only one XHTML
        # tag but it can have multiple styles.
        if tag == 'img' and elem_tree.attrib.get('src', None):
            # Only write the image tag if it is in the manifest.
            ihref = base.urlnormalize(page.abshref(elem_tree.attrib['src']))
            if ihref in self.oeb_book.manifest.hrefs:
                # Each distinct image gets a sequential id on first use.
                if ihref not in self.image_hrefs:
                    self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<image l:href="#%s"/>' %
                               self.image_hrefs[ihref])
            else:
                self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
            else:
                multiplier = ems
            if self.in_p:
                # Close the open tags up to and including the enclosing <p>,
                # emit the empty lines, then reopen the same tags.
                closed_tags = []
                open_tags = tag_stack + tags
                open_tags.reverse()
                for t in open_tags:
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line/>' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line/>' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack + tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'a' and elem_tree.attrib.get('href', None):
            # Handle only external links for now
            if urllib.parse.urlparse(elem_tree.attrib['href']).netloc:
                p_txt, p_tag = self.ensure_p()
                fb2_out += p_txt
                tags += p_tag
                fb2_out.append('<a l:href="%s">' %
                               base.urlnormalize(elem_tree.attrib['href']))
                tags.append('a')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'i' or style['font-style'] == 'italic':
            s_out, s_tags = self.handle_simple_tag('emphasis',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if (tag in ('del', 'strike')
                or style['text-decoration'] == 'line-through'):
            s_out, s_tags = self.handle_simple_tag('strikethrough',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sub':
            s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sup':
            s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags

        # Process element text. Text found outside a paragraph is wrapped in
        # its own <p>.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close open FB2 tags, innermost first.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but
        # before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out

Example #25

0

Show file

    def serialize_body(self):
        """Write the ``<body>`` element and every spine item to ``self.buf``.

        Linear spine items are emitted first, in spine order, followed by the
        non-linear ones. For periodicals a table-of-contents block is written
        at the top of the body, and another one ahead of each spine item that
        is flagged as a section start and matches a section TOC node.

        The buffer positions of the opening tag, of the start of the body
        content and of its end are stored on ``self`` as ``anchor_offset``,
        ``body_start_offset`` and ``body_end_offset``.
        """
        out = self.buf

        def as_bytes(title):
            # Titles may be text; everything written to the buffer is bytes.
            return title.encode('utf-8') if isinstance(title, str) else title

        def write_toc_level(tocref, href=None):
            # Emit one TOC level. When an href is supplied, a page break is
            # written first and the position after it is registered in
            # id_offsets under the normalized href (e.g. feed_0/index.html).
            if href is not None:
                out.write(b'<mbp:pagebreak />')
                self.id_offsets[base.urlnormalize(href)] = out.tell()

            periodical_level = tocref.klass == 'periodical'
            if periodical_level:
                out.write(b'<div> <div height="1em"></div>')
            else:
                out.write(b''.join((
                    b'<div></div> <div> <h2 height="1em"><font size="+2"><b>',
                    as_bytes(tocref.title),
                    b'</b></font></h2> <div height="1em"></div>')))

            out.write(b'<ul>')

            for entry in tocref.nodes:
                out.write(b'<li><a filepos=')
                target = entry.href
                if periodical_level:
                    # Entries of the periodical level are section nodes whose
                    # urls should look like r'feed_\d+/index.html'; strip the
                    # article component so the link does not point at the
                    # start of the first article.
                    target = re.sub(r'article_\d+/', '', target)
                # Remember where the placeholder digits start for this href.
                self.href_offsets[target].append(out.tell())
                out.write(b'0000000000 ><font size="+1"><b><u>')
                out.write(as_bytes(entry.title))
                out.write(b'</u></b></font></a></li>')

            out.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

        self.anchor_offset = out.tell()
        out.write(b'<body>')
        self.body_start_offset = out.tell()

        if self.is_periodical:
            top_toc = self.oeb.toc.nodes[0]
            write_toc_level(top_toc)

        linear_items = [entry for entry in self.oeb.spine if entry.linear]
        trailing_items = [entry for entry in self.oeb.spine
                          if not entry.linear]

        for item in linear_items + trailing_items:
            if self.is_periodical and item.is_section_start:
                for section_toc in top_toc.nodes:
                    if base.urlnormalize(item.href) != section_toc.href:
                        continue
                    # Build the section url of the form
                    # r'feed_\d+/index.html', emit the section TOC, then
                    # repoint the node at that url.
                    section_url = re.sub(r'article_\d+/', '',
                                         section_toc.href)
                    write_toc_level(section_toc, section_url)
                    section_toc.href = section_url
                    break

            self.serialize_item(item)

        self.body_end_offset = out.tell()
        out.write(b'</body>')

Example #26

0

Show file

    def _toc_from_navpoint(self, item, toc, navpoint):
        """Copy the ``ncx:navPoint`` children of *navpoint* into *toc*.

        Each child with a title becomes a node added through ``toc.add`` and
        is then processed recursively with that node as the parent. Children
        without a title are not added themselves; their own children are
        attached to *toc* instead. Children that have neither a target nor
        nested nav points are dropped, as are leaf children whose target is
        missing from the manifest.

        *item* supplies ``abshref`` for resolving the ``src`` of each entry.
        """
        for point in base.xpath(navpoint, 'ncx:navPoint'):
            label = ''.join(base.xpath(point, 'ncx:navLabel/ncx:text/text()'))
            label = base.COLLAPSE_RE.sub(' ', label.strip())
            src = base.xpath(point, 'ncx:content/@src')

            if not label:
                # Untitled entry: skip it but keep its descendants.
                self._toc_from_navpoint(item, toc, point)
                continue

            has_target = bool(src and src[0])
            if not has_target and not base.xpath(point, 'ncx:navPoint'):
                # No target and no children: this node is useless.
                continue

            if has_target:
                target = item.abshref(base.urlnormalize(src[0]))
            else:
                target = ''

            path = urllib.parse.urldefrag(target)[0]
            if path and path not in self.oeb.manifest.hrefs:
                # Retry the lookup with the normalized form of the path.
                path = base.urlnormalize(path)
            if target and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % target)
                if not base.xpath(point, 'ncx:navPoint'):
                    # Dangling leaf: this node is useless.
                    continue

            node_id = point.get('id')
            node_class = point.get('class', 'chapter')

            try:
                play_order = int(point.get('playOrder',
                                           self.oeb.toc.next_play_order()))
            except Exception:
                play_order = self.oeb.toc.next_play_order()

            author_meta = base.xpath(
                point, 'descendant::calibre:meta[@name = "author"]')
            author = author_meta[0].text if author_meta else None

            description_meta = base.xpath(
                point, 'descendant::calibre:meta[@name = '
                '"description"]')
            description = None
            if description_meta:
                # An empty description is treated the same as a missing one.
                description = etree.tostring(description_meta[0],
                                             method='text',
                                             encoding='unicode').strip() or None

            thumbnail_meta = base.xpath(
                point, 'descendant::calibre:meta[@name = '
                '"toc_thumbnail"]')
            toc_thumbnail = thumbnail_meta[0].text if thumbnail_meta else None
            if not (toc_thumbnail and toc_thumbnail.strip()):
                toc_thumbnail = None

            node = toc.add(label,
                           target,
                           id=node_id,
                           klass=node_class,
                           play_order=play_order,
                           description=description,
                           author=author,
                           toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, point)

Example #27

0

Show file

File: html_input.py Project: keshavbhatt/ebook-converter

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Assemble an OEB book from an HTML file and the files it links to.

        Fills in missing metadata (language, creator, title) and adds a fresh
        UUID identifier, adds every non-binary file returned by
        ``get_filelist`` to the manifest and spine, rewrites links in the
        HTML and CSS through ``self.resource_adder``, and builds a flat table
        of contents from each document's ``<title>`` or first heading text.

        :param htmlpath: path of the root HTML file.
        :param basedir: base directory handed to ``get_filelist``.
        :param opts: conversion options; ``input_encoding``, ``language`` and
            ``authors`` are read here.
        :param log: logger, also called directly for progress messages.
        :param mi: metadata object merged into the book's metadata.
        :return: the populated OEB book (also stored as ``self.oeb``).
        """
        import uuid
        from ebook_converter.ebooks.conversion.plumber import create_oebbook
        from ebook_converter.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES,
            xpath, urlquote)
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from ebook_converter.ebooks.html.input import get_filelist
        from ebook_converter.ebooks.metadata import string_to_authors
        from ebook_converter.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # Merge the supplied metadata, then fill in whatever is still missing.
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate('Unknown')]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate('Unknown'))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries an id attribute;
                # the first identifier in the list is not guaranteed to.
                self.oeb.uid = ident
                break

        # Register every non-binary file in the manifest and the spine.
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(
                id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            self.added_resources[path] = href
        # Stash helpers on self; presumably read by self.resource_adder,
        # which is not visible here -- TODO confirm.
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urllib.parse.urldefrag
        self.BINARY_MIME = BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data,
                          functools.partial(self.resource_adder, base=dpath))

        # Rewrite urls inside stylesheets, resolving them against the
        # directory the stylesheet was added from (None when unknown).
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        functools.partial(self.resource_adder, base=dpath))

        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        # Keep title and header together with the item they came from. The
        # previous positional zip of a titles list against the spine shifted
        # every later title onto the wrong item as soon as one document had
        # no <title>.
        entries = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            header = '(unlabled)'
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                text = ''.join(xpath(html, expr % tag))
                text = re.sub(r'\s+', ' ', text.strip())
                if text:
                    header = text
                    break
            entries.append((item, title, header))
        titles = [title for _, title, _ in entries if title]
        # Duplicate titles are not useful as TOC labels; use headers instead.
        use_headers = len(titles) > len(set(titles))
        for item, title, header in entries:
            label = header if use_headers else title
            # In title mode an untitled document gets no entry, so a book
            # without any titles still ends up with an empty TOC as before.
            if label:
                toc.add(label, item.href)

        oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
        return oeb

Example #28

0

Show file

    def convert_epub3_nav(self, nav_path, opf, log, opts):
        """Generate an NCX file from the EPUB 3 navigation document.

        The first ``nav`` element of type ``toc`` that has an ``ol`` child is
        translated into nested ``navPoint`` elements. The result is written
        to a new ``.ncx`` file next to *nav_path*, added to the OPF manifest
        and referenced from every spine element. If no such ``nav`` exists
        the method returns without changing anything.

        The normalized nav href and the parsed nav tree are stored on *opts*
        as ``epub3_nav_href`` and ``epub3_nav_parsed``. When
        ``self.removed_cover`` is set, links that resolve to it are marked
        with ``data-calibre-removed-titlepage`` and the nav file is
        rewritten.
        """
        from lxml import etree
        from ebook_converter.ebooks.chardet import xml_to_unicode
        from ebook_converter.ebooks.oeb.polish.parsing import parse
        from ebook_converter.ebooks.oeb.base import \
            serialize
        from ebook_converter.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile

        with open(nav_path, 'rb') as nav_file:
            raw = nav_file.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
                               'ncx/" version="2005-1" xml:lang="eng">'
                               '<navMap/></ncx>')
        navmap = ncx[0]
        epub_type_attr = '{%s}type' % const.EPUB_NS
        nav_basename = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Build one navPoint under parent from the first <a>/<span>
            # child of li (if any) and return it.
            label = target = None
            first = next(li.iterchildren(base.tag('xhtml', 'a'),
                                         base.tag('xhtml', 'span')), None)
            if first is not None:
                label = etree.tostring(first, method='text',
                                       encoding='unicode',
                                       with_tail=False).strip()
                if not label:
                    # No text content: fall back to title attributes.
                    label = ' '.join(
                        first.xpath('descendant-or-self::*/@title')).strip()
                target = first.get('href')
                if target and target.startswith('#'):
                    # Fragment-only link: it points into the nav file itself.
                    target = nav_basename + target

            nav_point = parent.makeelement(base.tag('ncx', 'navPoint'))
            parent.append(nav_point)
            nav_label = nav_point.makeelement(base.tag('ncx', 'navLabel'))
            nav_point.append(nav_label)
            text_node = nav_point.makeelement(base.tag('ncx', 'text'))
            nav_label.append(text_node)
            text_node.text = label
            if target:
                nav_point.append(
                    nav_point.makeelement(base.tag('ncx', 'content'),
                                          attrib={'src': target}))
            return nav_point

        def process_nav_node(node, toc_parent):
            # Mirror the <li> children of node, descending into nested <ol>s.
            for li in node.iterchildren(base.tag('xhtml', 'li')):
                child = add_from_li(li, toc_parent)
                nested = first_child(li, base.tag('xhtml', 'ol'))
                if child is not None and nested is not None:
                    process_nav_node(nested, child)

        for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
            if nav.get(epub_type_attr) != 'toc':
                continue
            toc_list = first_child(nav, base.tag('xhtml', 'ol'))
            if toc_list is None:
                continue
            process_nav_node(toc_list, navmap)
            break
        else:
            # No usable toc nav was found.
            return

        # delete=False: the file must outlive this block, it is now part of
        # the book.
        with NamedTemporaryFile(suffix='.ncx',
                                dir=os.path.dirname(nav_path),
                                delete=False) as ncx_file:
            ncx_file.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(ncx_file.name,
                                   os.getcwd()).replace(os.sep, '/')
        manifest_item = opf.create_manifest_item(ncx_href, base.NCX_MIME,
                                                 append=True)
        ncx_id = manifest_item.get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)

        nav_url = os.path.relpath(nav_path).replace(os.sep, '/')
        opts.epub3_nav_href = base.urlnormalize(nav_url)
        opts.epub3_nav_parsed = root

        if not getattr(self, 'removed_cover', None):
            return

        nav_dir = os.path.dirname(nav_path)
        changed = False
        for elem in root.xpath('//*[@href]'):
            # Only the part before any fragment identifies the file.
            href = elem.get('href').partition('#')[0]
            link_path = os.path.relpath(
                os.path.join(nav_dir, urllib.parse.unquote(href)), nav_dir)
            if base.urlnormalize(link_path) == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as nav_file:
                nav_file.write(base.serialize(root, 'application/xhtml+xml'))

Example #29

0

Show file

File: mobiml.py Project: keshavbhatt/ebook-converter

    def mobimlize_elem(self, elem, stylizer, bstate, istates,
            ignore_valign=False):
        """Convert one XHTML element, and recursively its children, to MobiML.

        The element's computed style is translated into a format state that
        is pushed on ``istates`` for the duration of the call; text, tails
        and content-bearing tags are emitted through
        ``self.mobimlize_content``.

        :param elem: the element to convert. It may be modified in place
            (hidden elements with an id are reduced to an anchor, padding
            characters and quotation marks are added to its text/tail, table
            attributes are set).
        :param stylizer: object whose ``style(elem)`` gives the element's
            style.
        :param bstate: block-level state; its ``para``, ``vmargin``,
            ``vpadding``, ``pbreak``, ``istate`` and ``nested`` members are
            updated here.
        :param istates: list used as a stack of inline format states; a copy
            of the top entry is pushed on entry.
        :param ignore_valign: when true the vertical-align (sup/sub) handling
            is skipped; the method passes ``True`` when it re-enters itself
            from that handling.
        :return: ``None``.
        """
        # Comments, processing instructions and non-XHTML elements are
        # ignored.
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) != const.XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or style['visibility'] == 'hidden') and
                elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = base.tag('xhtml', 'a')
            else:
                return
        tag = parse_utils.barename(elem.tag)
        # Start from a copy of the parent's format state and push it; it is
        # popped again at the end of this method.
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start'])-1
            except:
                # NOTE(review): bare except; a non-numeric start attribute is
                # silently ignored.
                pass
        istates.append(istate)
        left = 0
        # Collapse the CSS display value to block/inline for the decision
        # below; table cells count as inline, other table displays as block.
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline') and style['display'] !=
                'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            # A block element always starts a new paragraph.
            bstate.para = None
            istate.halign = style['text-align']
            rawti = style._get('text-indent')
            istate.indent = style['text-indent']
            if hasattr(rawti, 'strip') and '%' in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            # Vertical margins collapse to the largest one seen; padding
            # absorbs the pending margin and is accumulated on top of it.
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            # Inline element outside a link: horizontal margin/padding is
            # approximated with non-breaking spaces, scaled by font size.
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = ('\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + ('\xa0' * spaces)
                else:
                    # NOTE(review): this appends to the last child's text,
                    # not its tail, so the spaces land inside that child --
                    # confirm this is intended.
                    last = elem[-1]
                    last.text = (last.text or '') + ('\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        # Copy the character-level formatting from the style into the state.
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = style['white-space'] == 'pre'
        istate.pre_wrap = style['white-space'] == 'pre-wrap'
        istate.bgcolor  = style['background-color']
        istate.fgcolor  = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        # Reduce the font family to one of monospace / sans-serif / serif.
        ff = style['font-family'].lower() if hasattr(style['font-family'], 'lower') else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
                'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        # Both id and name attributes are collected as anchor ids.
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) /
                                (72/self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                # At least one dimension is missing: read the image itself
                # and fill in the gap, keeping the aspect ratio when only
                # one dimension was given.
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:',
                            href)
                else:
                    try:
                        width, height = identify(item.data)[1:]
                    except Exception:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            # NOTE(review): a reported height of 0 would
                            # raise ZeroDivisionError here, outside any try.
                            ar = width / height
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height'])*ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width'])/ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0 and style._get('width') not in {'100%', 'auto'}:
            # Rule widths are always emitted as a percentage; absolute widths
            # are converted relative to the profile width.
            raww = style._get('width')
            if hasattr(raww, 'strip') and '%' in raww:
                istate.attrib['width'] = raww
            else:
                prop = style['width'] / self.profile.width
                istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            # NOTE(review): display was rewritten from 'table-cell' to
            # 'inline' above, so this branch looks unreachable -- confirm.
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                    'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            # Wrap inline quotations in curly double quotes: the opening one
            # goes in front of the text, the closing one in front of the tail.
            t = elem.text
            if not t:
                t = ''
            elem.text = '\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = '\u201d' + t
        # Work out the leading text to emit: verbatim when whitespace is
        # preserved, dropped when it is only whitespace ahead of a non-inline
        # first child, otherwise with whitespace collapsed.
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text) and hasattr(elem[0].tag, 'rpartition') and
                  elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top',
                'text-bottom', 'top', 'bottom') or (
                isinstance(valign, numbers.Number) and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (
            isinstance(valign, numbers.Number) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            # Inline element with a non-baseline vertical alignment: render
            # it into a scratch document with ignore_valign=True, then move
            # the rendered content under <sup>/<sub><small> in the real
            # output.
            nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, base.tag('xhtml', 'body')))
            vbstate.para = etree.SubElement(vbstate.body, base.tag('xhtml', 'p'))
            self.mobimlize_elem(elem, stylizer, vbstate, istates,
                    ignore_valign=True)
            # Drop the state pushed above, since the early return below skips
            # the pop at the end of this method; never leave the stack empty.
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
                vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return
            # NOTE(review): when parent is None execution falls through with
            # the state already popped, and the pop at the end of this method
            # then removes one more entry -- confirm this is intended.

        if tag == 'blockquote':
            # Margins are honoured inside blockquotes regardless of the
            # option; the previous value is restored after the children.
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem)==0)):
            if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                # An explicit value attribute resets the numbering held in
                # the enclosing state.
                try:
                    value = int(elem.attrib['value'])
                    istates[-2].list_num = value - 1
                except:
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            # Emit the text that follows the child, using the same whitespace
            # rules as for the leading text.
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            # A paragraph holding nothing but a single non-breaking space is
            # an empty block: turn it into a line break, or drop it.
            if para is not None and para.text == '\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(base.tag('xhtml', 'br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            # Same margin/padding bookkeeping as at the top, for the bottom
            # edge of the block.
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        # Undo the push made at the start of this call.
        istates.pop()

Example #30

0

Show file

File: stylizer.py Project: gryf/ebook-converter

    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        """Collect the stylesheets that apply to one document and resolve
        them onto its elements.

        The constructor works in four stages:

        1. Gather stylesheets in cascade order: the built-in HTML
           stylesheet, ``base_css``, the document's ``<style>`` and
           ``<link>`` elements (including manifest stylesheets pulled in via
           ``@import``), then ``extra_css`` and ``user_css``.
        2. Build a ``StylizerRules`` object from them, or reuse the one
           cached on ``oeb`` when it reports the same rules.
        3. Apply every rule to the elements its selector matches, with
           special handling for pseudo-classes and a faked ``first-letter``
           for the 'mobi' and 'docx' output formats.
        4. Apply inline ``style`` attributes, and turn ``width``/``height``
           attributes of images into CSS when the image has no CSS
           dimensions of its own.

        :param tree: Parsed document tree; it is queried with XPath and
            CSS selectors and modified in place (first-letter spans are
            inserted, img ``width``/``height`` attributes are removed).
        :param path: Href of the document; used as a key into
            ``oeb.manifest.hrefs``.
        :param oeb: Book container providing ``manifest``, ``logger`` and
            ``css_preprocessor``; the computed rules are cached on it as
            ``stylizer_rules``.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Profile to use. When ``None``, the profile whose
            ``short_name`` is ``'default'`` is used, falling back to
            ``opts.output_profile``.
        :param extra_css: CSS text appended after the document stylesheets.
        :param user_css: CSS text appended after ``extra_css``.
        :param base_css: CSS text inserted right after the built-in HTML
            stylesheet.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from ebook_converter.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Inline <style> content and extra/user CSS are parsed under this
        # synthetic stylesheet name derived from the document's file name.
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = base.xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add css_parser parsing profiles from output_profile
        for css_module in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(css_module['name'],
                                        css_module['props'],
                                        css_module['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        for elem in style_tags:
            if (elem.tag == base.tag('xhtml', 'style') and elem.get('type', base.CSS_MIME) in base.OEB_STYLES and media_ok(elem.get('media'))):
                # The CSS may be spread over the element's own text and the
                # text/tail of any child nodes; concatenate all of it.
                text = elem.text if elem.text else ''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += '\n\n' + uenc.force_unicode(t, 'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += '\n\n' + uenc.force_unicode(t, 'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warning('Ignoring missing '
                                                    'stylesheet in @import '
                                                    'rule: %s', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in base.OEB_STYLES:
                                self.logger.warning('CSS @import of non-CSS '
                                                    'file %r', rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == base.tag('xhtml', 'link') and elem.get('href') and elem.get(
                    'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                    'type', base.CSS_MIME).lower() in base.OEB_STYLES and media_ok(elem.get('media'))
                ):
                href = base.urlnormalize(elem.attrib['href'])
                sheet_path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(sheet_path, None)
                if sitem is None:
                    self.logger.warning('Stylesheet %r referenced by file %r '
                                        'not in manifest', sheet_path, item.href)
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warning('Stylesheet %r referenced by file %r '
                                        'is not CSS', sheet_path, item.href)
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                # A broken extra/user stylesheet must not abort the whole
                # conversion: log it and carry on without it.
                try:
                    stylesheet = parser.parseString(x, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except Exception:
                    self.logger.exception('Failed to parse %s, ignoring.', w)
                    self.logger.debug('Bad css: %s', x)

        # using oeb to store the rules, page rule and font face rules
        # and generating them again if opts, profile or stylesheets are different
        if (not hasattr(self.oeb, 'stylizer_rules')) \
            or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
            self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile, stylesheets)
        self.rules = self.oeb.stylizer_rules.rules
        self.page_rule = self.oeb.stylizer_rules.page_rule
        self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
        self.flatten_style = self.oeb.stylizer_rules.flatten_style

        self._styles = {}
        pseudo_pat = re.compile(':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in self.rules:
            # `text` is the rule's selector; `fl` is the pseudo-class or
            # pseudo-element found in it, if any.
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: '
                                  '%r (%s)', text, err)
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {'mobi', 'docx'}:
                    # Fake first-letter
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                # Leading punctuation and separators are
                                # styled together with the first real
                                # character.
                                punctuation_chars = []
                                remaining = str(x.text)
                                while remaining:
                                    category = unicodedata.category(remaining[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(remaining[0])
                                    remaining = remaining[1:]

                                special_text = ''.join(punctuation_chars) + \
                                        (remaining[0] if remaining else '')
                                span = x.makeelement('{%s}span' %
                                                     const.XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                span.tail = remaining[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                # Only the first descendant that has text
                                # receives the fake first letter.
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in base.xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in base.xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    # Drop the attribute if present; a missing attribute is
                    # expected (only one of the two needs to exist) and is
                    # not an error.
                    elem.attrib.pop(prop, None)
                    if val:
                        # Unitless numbers are treated as pixel values.
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)