Python XHTML Examples, ebook_converter.ebooks.oeb.base.XHTML Python Examples

Example #1

0

Show file

File: mobi_output.py Project: gryf/ebook-converter

 def workaround_fire_bugs(self, jacket):
     """Replace the jacket's layout table with div/span markup.

     The Fire crashes when it tries to render the table used to lay out
     the jacket, so every ``table`` and ``tr`` element found in
     ``jacket.data`` is retagged as an XHTML ``div`` and every ``td``
     inside a row is retagged as an XHTML ``span``. Elements are modified
     in place; nothing is returned.
     """
     from ebook_converter.ebooks.oeb.base import XHTML

     tables = jacket.data.xpath('//*[local-name()="table"]')
     for table_el in tables:
         table_el.tag = XHTML('div')
         rows = table_el.xpath('descendant::*[local-name()="tr"]')
         for row in rows:
             # Collect the cells before retagging the row itself.
             cells = row.xpath('descendant::*[local-name()="td"]')
             row.tag = XHTML('div')
             for cell in cells:
                 # Inside this loop the row has at least one cell, so the
                 # cell always becomes a span.
                 cell.tag = XHTML('span')

Example #2

0

Show file

File: replace.py Project: gryf/ebook-converter

def remove_links_to(container, predicate):
    """Remove every link that ``predicate`` selects from the container.

    ``predicate`` is called as ``predicate(name, href, fragment)`` and must
    return True iff the link should be removed.

    For each entry of ``container.mime_map`` whose type is in ``OEB_DOCS``
    the links in the markup, in ``<style>`` elements and in ``style``
    attributes are examined; entries whose type is in ``OEB_STYLES`` are
    passed to ``remove_links_in_sheet``. Every name that was modified is
    marked dirty on the container.

    Returns the set of names that were changed.
    """
    from ebook_converter.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    find_style_tags = XPath('//h:style')
    find_styled_elements = XPath('//*[@style]')
    changed = set()

    for name, mt in container.mime_map.items():
        removed = False
        if mt in OEB_DOCS:
            root = container.parsed(name)

            # Links carried by elements and attributes of the markup.
            links = iterlinks(root, find_links_in_css=False)
            for el, attr, href, pos in links:
                target = container.href_to_name(href, name)
                fragment = href.partition('#')[2]
                if not predicate(target, href, fragment):
                    continue
                if attr is None:
                    # The link lives in the element text, not an attribute.
                    el.text = None
                elif el.tag in (XHTML('link'), XHTML('img')):
                    extract(el)
                else:
                    del el.attrib[attr]
                removed = True

            # Links inside <style> elements holding CSS.
            for style_tag in find_style_tags(root):
                if not style_tag.text:
                    continue
                declared_type = style_tag.get('type') or 'text/css'
                if declared_type.lower() != 'text/css':
                    continue
                sheet = container.parse_css(style_tag.text)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_sheet(resolver, sheet, predicate):
                    style_tag.text = css_text(sheet)
                    removed = True

            # Links inside inline style="" declarations.
            for styled in find_styled_elements(root):
                inline = styled.get('style')
                if not inline:
                    continue
                declaration = container.parse_css(inline,
                                                  is_declaration=True)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_declaration(resolver, declaration,
                                               predicate):
                    removed = True
                    styled.set('style', css_text(declaration))
        elif mt in OEB_STYLES:
            resolver = partial(container.href_to_name, base=name)
            removed = remove_links_in_sheet(resolver,
                                            container.parsed(name),
                                            predicate)
        if removed:
            changed.add(name)

    for dirty_name in changed:
        container.dirty(dirty_name)
    return changed

Example #3

0

Show file

 def mlize_spine(self, oeb_book):
     """Render every spine item of ``oeb_book`` and return one string.

     For each item its ids and links are rewritten, a ``Stylizer`` is
     built for it, and the text dumped from its XHTML ``body`` element is
     collected, followed by a blank-line separator. The collected pieces
     are joined into a single string.
     """
     pieces = ['']
     for spine_item in oeb_book.spine:
         self.log.debug('Converting %s to Markdown formatted TXT...' % spine_item.href)
         self.rewrite_ids(spine_item.data, spine_item)
         link_rewriter = partial(self.rewrite_link, page=spine_item)
         rewrite_links(spine_item.data, link_rewriter)
         stylizer = Stylizer(spine_item.data, spine_item.href, oeb_book,
                             self.opts, self.opts.output_profile)
         body = spine_item.data.find(XHTML('body'))
         pieces.extend(self.dump_text(body, stylizer))
         pieces.append('\n\n')
     return ''.join(pieces)

Example #4

0

Show file

 def postprocess_book(self, oeb, opts, log):
     """Clean up each spine item of ``oeb`` after conversion.

     Two things are done for every spine item whose data supports XPath:

     * all ``metadata`` and ``guide`` elements are removed from the tree;
     * if the body consists of a single ``<pre>`` element, its text is
       re-flowed into paragraphs, optionally run through punctuation
       smartening (``opts.smarten_punctuation``), and the ``<pre>`` is
       turned into a ``<div>`` holding the converted markup.

     Messages go to ``self.log``; the ``log`` argument is not used here.
     """
     from ebook_converter.ebooks.oeb.base import XPath, XHTML
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             continue

         # Strip stray <metadata> and <guide> elements.
         for bad in ('metadata', 'guide'):
             for stray in XPath('//h:'+bad)(root):
                 stray.getparent().remove(stray)

         bodies = XPath('//h:body')(root)
         if not bodies:
             continue
         body = bodies[0]
         if len(body) != 1 or body[0].tag != XHTML('pre'):
             continue

         pre = body[0]
         from ebook_converter.ebooks.txt.processor import \
             convert_basic, separate_paragraphs_single_line
         from ebook_converter.ebooks.chardet import xml_to_unicode
         self.log('LIT file with all text in singe <pre> tag '
                  'detected')
         html = separate_paragraphs_single_line(pre.text)
         namespaced_open = '<html xmlns="%s">' % const.XHTML_NS
         html = convert_basic(html).replace('<html>', namespaced_open)
         html = xml_to_unicode(html, strip_encoding_pats=True,
                               resolve_entities=True)[0]
         if opts.smarten_punctuation:
             # SmartyPants skips text inside <pre> tags
             from ebook_converter.ebooks.conversion import \
                     preprocess
             html = preprocess.smarten_punctuation(html, self.log)

         converted_root = etree.fromstring(html)
         converted_bodies = XPath('//h:body')(converted_root)
         pre.tag = XHTML('div')
         pre.text = ''
         # Each matched <body> element is deep-copied into the former
         # <pre>, which is now a <div>.
         for matched in converted_bodies:
             pre.append(copy.deepcopy(matched))

Example #5

0

Show file

def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    """Make every file in ``names`` link to every stylesheet in ``sheets``.

    :param container: object providing ``parsed``, ``dirty``,
        ``href_to_name``, ``name_to_href``, ``remove_from_xml`` and
        ``insert_into_xml``.
    :param names: names of the files to update.
    :param sheets: names of the stylesheets to link, in insertion order.
    :param remove: when true, existing ``<link href>`` elements whose type
        is ``mtype`` (or is missing/empty) are removed first.
    :param mtype: MIME type used both to recognise existing stylesheet
        links and as the ``type`` of the newly created ones.
    :return: the set of names that were modified.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML
    changed_names = set()
    wanted = set(sheets)
    find_links = XPath('//h:link[@href]')
    find_head = XPath('//h:head')

    def is_sheet_link(link):
        # A missing or empty type attribute counts as ``mtype``.
        return (link.get('type', mtype) or mtype) == mtype

    for name in names:
        root = container.parsed(name)

        if remove:
            for link in find_links(root):
                if is_sheet_link(link):
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)

        already_linked = set()
        for link in find_links(root):
            if is_sheet_link(link):
                already_linked.add(
                    container.href_to_name(link.get('href'), name))

        missing = wanted - already_linked
        if not missing:
            continue

        changed_names.add(name)
        try:
            head = find_head(root)[0]
        except (TypeError, IndexError):
            # No <head> yet: create one as the first child of the root.
            head = root.makeelement(XHTML('head'))
            container.insert_into_xml(root, head, index=0)

        # Walk ``sheets`` rather than ``missing`` to keep the given order.
        for sheet in sheets:
            if sheet not in missing:
                continue
            new_link = head.makeelement(
                XHTML('link'),
                rel='stylesheet',
                type=mtype,
                href=container.name_to_href(sheet, name))
            container.insert_into_xml(head, new_link)
        container.dirty(name)

    return changed_names

Example #6

0

Show file

File: input.py Project: keshavbhatt/ebook-converter

    def epubify_markup(self, root, log):
        """Adjust the XHTML tree ``root`` in place for EPUB renderers.

        The following fixes are applied:

        * empty ``<title>`` elements get a single space as text;
        * a ``<p>`` that directly contains a ``<div>`` is retagged as a
          ``<div>``;
        * a ``<div>`` whose only child is a styled ``<img>`` gets
          ``position:static`` appended to its style, and the image style is
          replaced with ``max-width: 100%; max-height: 100%``;
        * for ``div/div/img`` chains where each div has exactly one child,
          auto side margins on the inner div are mapped to a ``text-align``
          on the outer div, and the inner div is made ``display:inline``.

        :param root: parsed XHTML tree to modify.
        :param log: not used by this method.
        """
        from ebook_converter.ebooks.oeb.base import XPath, XHTML
        # Fix empty title tags
        for t in XPath('//h:title')(root):
            if not t.text:
                t.text = ' '
        # Fix <p><div> constructs as the asinine epubchecker complains
        # about them
        pdiv = XPath('//h:p/h:div')
        for div in pdiv(root):
            div.getparent().tag = XHTML('div')

        # Remove the position:relative as it causes problems with some epub
        # renderers. Remove display: block on an image inside a div as it is
        # redundant and prevents text-align:center from working in ADE
        # Also ensure that the img is contained in its containing div
        imgpath = XPath('//h:div/h:img[@style]')
        for img in imgpath(root):
            div = img.getparent()
            if len(div) == 1:
                style = div.attrib.get('style', '')
                if style and not style.endswith(';'):
                    style = style + ';'
                style += 'position:static'  # Ensures position of containing div is static
                # Ensure that the img is always contained in its frame
                div.attrib['style'] = style
                img.attrib['style'] = 'max-width: 100%; max-height: 100%'

        # Handle anchored images. The default markup + CSS produced by
        # odf2xhtml works with WebKit but not with ADE. So we convert the
        # common cases of left/right/center aligned block images to work on
        # both webkit and ADE. We detect the case of setting the side margins
        # to auto and map it to an appropriate text-align directive, which
        # works in both WebKit and ADE.
        # https://bugs.launchpad.net/bugs/1063207
        # https://bugs.launchpad.net/calibre/+bug/859343
        imgpath = XPath('descendant::h:div/h:div/h:img')
        for img in imgpath(root):
            div2 = img.getparent()
            div1 = div2.getparent()
            if (len(div1), len(div2)) != (1, 1):
                continue
            cls = div1.get('class', '')
            first_rules = list(
                filter(None, [self.get_css_for_class(x) for x in cls.split()]))
            has_align = any(
                r.style.getProperty('text-align') is not None
                for r in first_rules)
            ml = mr = None
            if not has_align:
                aval = None
                cls = div2.get('class', '')
                rules = list(
                    filter(None,
                           [self.get_css_for_class(x) for x in cls.split()]))
                for r in rules:
                    ml = r.style.getPropertyCSSValue('margin-left') or ml
                    mr = r.style.getPropertyCSSValue('margin-right') or mr
                # Unwrap the CSS values only once, after all rules have been
                # seen. Doing it inside the loop discarded a margin found in
                # an earlier rule whenever a later rule did not define it.
                ml = getattr(ml, 'value', None)
                mr = getattr(mr, 'value', None)
                if ml == mr == 'auto':
                    aval = 'center'
                elif ml == 'auto' and mr != 'auto':
                    aval = 'right'
                elif ml != 'auto' and mr == 'auto':
                    aval = 'left'
                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    style += 'text-align:%s' % aval
                    has_align = True
                    div1.attrib['style'] = style

            if has_align:
                # This is needed for ADE, without it the text-align has no
                # effect. The inner div may have no style attribute at all,
                # so fall back to an empty string instead of raising KeyError.
                style = div2.attrib.get('style', '')
                div2.attrib['style'] = 'display:inline;' + style