Python parse_html_string Examples, ebooklib.utils.parse_html_string Python Examples

Example #1

0

Show file

    def filter_chapter(self, html, new_css_item):
        '''Process one chapter; every chapter is a single HTML document.

        Registers the stylesheet on the chapter, passes every text node of the
        chapter body to the uncommon-character helpers, and finally writes the
        modified body back into ``html.content``.

        ``new_css_item`` is accepted but not used in this method.
        '''
        # Attach the CSS file to every HTML file.
        rel_css_dir = os.path.relpath('.', os.path.dirname(html.get_name()))
        rel_css_file_name=os.path.join(rel_css_dir, self.new_css_filename)
        html.add_item(epub.EpubItem(file_name=rel_css_file_name, media_type='text/css'))

        rel_image_dir=os.path.relpath(self.font_image_dir, os.path.dirname(html.get_name()))
        # Build the links that belong to this HTML document.
        self.init_links_of_html(html)
        # Only the main text inside the body is processed.
        html_tree = parse_html_string(html.get_body_content())
        root=html_tree.getroottree()
        build_text_list = etree.XPath("//text()")
        text_list=build_text_list(root)
        for text in text_list:
            # Find the positions of the uncommon characters in this piece of text.
            pos_list = self.find_uncommon_words_in_one_text(text)
            self.add_image_tag_for_uncommon_words_in_one_text(text, pos_list,rel_image_dir)
        # Write root back into the content of html; otherwise the changes are not saved.
        ori_root=parse_html_string(html.content)
        # Remove the old body.
        body=ori_root.find('body')
        ori_root.remove(body)
        # Append the new body.
        ori_root.append(root.find('body'))
        html.content = etree.tostring(ori_root, pretty_print=True, encoding='utf-8', xml_declaration=True)

Example #2

0

Show file

File: readerplugins.py Project: sourcefabric/Booktype

    def html_after_read(self, book, chapter):
        """Take the chapter title from ``<head>`` and strip unwanted attributes.

        The chapter content is parsed, ``chapter.title`` is set from the
        ``<title>`` element when one exists, every attribute named in
        ``self.remove_attributes`` is deleted from ``<body>`` and its
        descendants, and the tree is serialised back into ``chapter.content``.
        A chapter whose content cannot be parsed is left untouched.

        :Args:
          - book: not used by this method
          - chapter: item whose ``content`` and ``title`` are updated in place
        """
        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: an unparsable chapter keeps its original content.
            return

        # find() returns None for a missing element, so check for None before
        # asking for the number of children.
        head = tree.find("head")

        if head is not None and len(head) != 0:
            title = head.find("title")

            if title is not None:
                chapter.title = title.text

        body = tree.find("body")

        if body is not None and len(body) != 0:
            # todo:
            # - fix <a href="">
            # - fix ....

            for _item in body.iter():
                for t in self.remove_attributes:
                    if t in _item.attrib:
                        del _item.attrib[t]

        chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

Example #3

0

Show file

File: epub.py Project: homaralex/ebooklib

    def _parse_nav(self, data, base_path):
        """Read the navigation document and store its outline in ``self.book.toc``.

        The first ``nav`` element that has any attribute equal to ``toc`` is
        used and its ``ol`` list is converted recursively. A list item holding
        a nested ``ol`` becomes a ``(Section, children)`` tuple, a list item
        with only a link becomes a ``Link``, and any other item is skipped.

        :Args:
          - data: markup of the navigation document
          - base_path: path every ``href`` is joined to before normalisation
        """
        document = parse_html_string(data)
        toc_nav = document.xpath("//nav[@*='toc']")[0]

        def resolve(anchor):
            # join the link target to base_path and normalise the result
            return zip_path.normpath(zip_path.join(base_path, anchor.get("href")))

        def parse_list(list_node):
            entries = []

            for entry in list_node.findall("li"):
                nested = entry.find("ol")
                anchor = entry.find("a")

                if nested is None:
                    # leaf item: only kept when it carries a link
                    if anchor is not None:
                        label = anchor.text
                        entries.append(Link(resolve(anchor), label))
                    continue

                # the section title is the text of the item's first child
                label = entry[0].text
                sub_entries = parse_list(nested)

                if anchor is None:
                    section = Section(label)
                else:
                    section = Section(label, href=resolve(anchor))

                entries.append((section, sub_entries))

            return entries

        self.book.toc = parse_list(toc_nav.find("ol"))

Example #4

0

Show file

File: epub.py Project: homaralex/ebooklib

    def get_body_content(self):
        """
        Returns content of BODY element for this HTML document. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).

        An empty string is returned when the content cannot be parsed or when
        the document has no BODY element with child elements.

        :Returns:
          Returns content of this document.
        """
        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ''

        # find() returns None when there is no BODY, so test for None before
        # asking for the number of children.
        body = html_tree.find('body')

        if body is None or len(body) == 0:
            return ''

        tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)

        # this is so stupid
        open_tag = six.b('<body>')

        if tree_str.startswith(open_tag):
            start = len(open_tag)
            end = tree_str.rindex(six.b('</body>'))

            # Skip only the line break that may follow the opening tag. A
            # fixed offset of 7 also removed the first character of content
            # that starts directly after <body>.
            if tree_str[start:start + 1] == six.b('\n'):
                start += 1

            return tree_str[start:end]

        return tree_str

Example #5

0

Show file

    def get_body_content(self):
        """
        Returns content of BODY element for this HTML document. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).

        An empty string is returned when the content cannot be parsed or when
        the document has no BODY element with child elements.

        :Returns:
          Returns content of this document.
        """

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ''

        # find() returns None when there is no BODY, so test for None before
        # asking for the number of children.
        body = html_tree.find('body')

        if body is None or len(body) == 0:
            return ''

        tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)

        # this is so stupid
        open_tag = six.b('<body>')

        if tree_str.startswith(open_tag):
            start = len(open_tag)
            end = tree_str.rindex(six.b('</body>'))

            # Skip only the line break that may follow the opening tag. A
            # fixed offset of 7 also removed the first character of content
            # that starts directly after <body>.
            if tree_str[start:start + 1] == six.b('\n'):
                start += 1

            return tree_str[start:end]

        return tree_str

Example #6

0

Show file

File: epub.py Project: leleu/ebooklib

    def get_body_content(self):
        """Return the markup found inside the BODY element, serialised as UTF-8.

        An empty string is returned when the content cannot be parsed or when
        the document has no BODY element with child elements. When the
        serialised element does not start with a bare ``<body>`` tag it is
        returned whole, including the tag.
        """
        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ''

        # find() returns None when there is no BODY, so test for None before
        # asking for the number of children.
        body = html_tree.find('body')

        if body is None or len(body) == 0:
            return ''

        tree_str = etree.tostring(body,
                                  pretty_print=True,
                                  encoding='utf-8',
                                  xml_declaration=False)

        # this is so stupid
        # tostring() with an explicit encoding yields a byte string, so the
        # markers are byte literals (identical to str on Python 2).
        open_tag = b'<body>'

        if tree_str.startswith(open_tag):
            start = len(open_tag)
            end = tree_str.rindex(b'</body>')

            # Skip only the line break that may follow the opening tag. A
            # fixed offset of 7 also removed the first character of content
            # that starts directly after <body>.
            if tree_str[start:start + 1] == b'\n':
                start += 1

            return tree_str[start:end]

        return tree_str

Example #7

0

Show file

File: booktype_tags.py Project: danielhjames/Booktype

    def _reformat_endnotes(content):
        """Drop orphan endnotes and link each endnote reference to its note.

        Inside every ``ol`` with class ``endnotes`` the ``li`` items with
        class ``orphan-endnote`` are removed. The text of every ``sup`` that
        has a ``data-id`` attribute is moved into a new ``a`` element whose
        href is ``#endnote-<data-id>``.

        Returns the serialised markup without the surrounding
        ``<html><body>`` wrapper, or ``content`` unchanged when it cannot be
        parsed (the failure is logged).
        """

        try:
            tree = parse_html_string(content.encode('utf-8'))
        except Exception as err:
            logger.error('Error parsing chapter content {err}'.format(err=err))
            return content

        for elem in tree.iter():
            # remove endnotes without reference
            if elem.tag == 'ol' and elem.get('class') == 'endnotes':
                # ".//" keeps the search inside this list; a leading "//" is
                # evaluated from the document root and matched items anywhere.
                for li in elem.xpath(".//li[@class='orphan-endnote']"):
                    li.drop_tree()

            # insert internal link to endnote's body into the sup
            elif elem.tag == 'sup' and elem.get('data-id'):
                a = etree.Element("a")
                a.set('href', '#endnote-{0}'.format(elem.get('data-id')))
                a.text = elem.text
                elem.text = ''
                elem.insert(0, a)

        content = etree.tostring(tree, method='html', encoding='utf-8', xml_declaration=False)
        content = content.replace('<html><body>', '').replace('</body></html>', '')

        return content

Example #8

0

Show file

def remove_unknown_tags(html_content):
    """
    Remove unknown tags from a given html content string.
    This method is based on a method of Cleaner class on lxml.html module

    Elements whose tag is not listed in ``lxml.html.defs.tags`` lose their
    tag while their content is kept. A disallowed root element is renamed to
    a bare ``div`` instead. The cleaned tree is returned serialised as UTF-8
    with an XML declaration. When the content cannot be parsed the problem is
    logged and ``html_content`` is returned unchanged.
    """

    from lxml.html import defs

    try:
        tree = parse_html_string(html_content)
    except Exception as err:
        logger.error(
            "RemoveUnknownTags: Problem while trying to parse content %s" %
            err)
        # There is no tree to clean; falling through would fail on the
        # unbound name, so hand the input back as it came in.
        return html_content

    allow_tags = set(defs.tags)

    if allow_tags:
        bad = [el for el in tree.iter() if el.tag not in allow_tags]

        if bad:
            if bad[0] is tree:
                # the root element is renamed rather than dropped
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()

    return etree.tostring(tree,
                          pretty_print=True,
                          encoding='utf-8',
                          xml_declaration=True)

Example #9

0

Show file

    def _reformat_endnotes(content):
        """Drop orphan endnotes and link each endnote reference to its note.

        Inside every ``ol`` with class ``endnotes`` the ``li`` items with
        class ``orphan-endnote`` are removed. The text of every ``sup`` that
        has a ``data-id`` attribute is moved into a new ``a`` element whose
        href is ``#endnote-<data-id>``.

        Returns the serialised markup without the surrounding
        ``<html><body>`` wrapper, or ``content`` unchanged when it cannot be
        parsed (the failure is logged).
        """

        try:
            tree = parse_html_string(content.encode('utf-8'))
        except Exception as err:
            logger.error('Error parsing chapter content {err}'.format(err=err))
            return content

        for elem in tree.iter():
            # remove endnotes without reference
            if elem.tag == 'ol' and elem.get('class') == 'endnotes':
                # ".//" keeps the search inside this list; a leading "//" is
                # evaluated from the document root and matched items anywhere.
                for li in elem.xpath(".//li[@class='orphan-endnote']"):
                    li.drop_tree()

            # insert internal link to endnote's body into the sup
            elif elem.tag == 'sup' and elem.get('data-id'):
                a = etree.Element("a")
                a.set('href', '#endnote-{0}'.format(elem.get('data-id')))
                a.text = elem.text
                elem.text = ''
                elem.insert(0, a)

        content = etree.tostring(tree, method='html', encoding='utf-8', xml_declaration=False)
        content = content.replace('<html><body>', '').replace('</body></html>', '')

        return content

Example #10

0

Show file

File: epub.py Project: 57uff3r/ebooklib

    def get_body_content(self):
        """Return the markup found inside the BODY element of this document.

        On Python 3 the result is a text string; on Python 2 it is the UTF-8
        encoded, pretty-printed serialisation. An empty string is returned
        when the content cannot be parsed or when the document has no BODY
        element with child elements. When the serialised element does not
        start with a bare ``<body>`` tag it is returned whole, including the
        tag.
        """
        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ''

        # find() returns None when there is no BODY, so test for None before
        # asking for the number of children.
        body = html_tree.find('body')

        if body is None or len(body) == 0:
            return ''

        if sys.version_info >= (3, 0):
            tree_str = etree.tostring(body, encoding='unicode')
        else:
            tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)

        # this is so stupid
        open_tag = '<body>'

        if tree_str.startswith(open_tag):
            start = len(open_tag)
            end = tree_str.rindex('</body>')

            # Skip only a line break that follows the opening tag. The
            # Python 3 branch does not pretty-print, so a fixed offset of 7
            # cut off the first character of the first child.
            if tree_str[start:start + 1] == '\n':
                start += 1

            return tree_str[start:end]

        return tree_str

Example #11

0

Show file

File: misc.py Project: kronoscode/Booktype

    def html_after_read(self, book, chapter):
        """Set the chapter title, re-point local images and strip attributes.

        ``chapter.title`` is taken from the ``<title>`` element when one
        exists. For ``<body>`` and its descendants, every ``img`` whose
        ``src`` does not start with ``http`` is pointed at
        ``static/<converted name>``, and every attribute named in
        ``self.remove_attributes`` is deleted. The tree is then serialised
        back into ``chapter.content``. A chapter whose content cannot be
        parsed is left untouched.

        :Args:
          - book: not used by this method
          - chapter: item whose ``content`` and ``title`` are updated in place
        """
        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: an unparsable chapter keeps its original content.
            return

        # find() returns None for a missing element, so check for None before
        # asking for the number of children.
        head = tree.find('head')

        if head is not None and len(head) != 0:
            title = head.find('title')

            if title is not None:
                chapter.title = title.text

        body = tree.find('body')

        if body is not None and len(body) != 0:
            # todo:
            # - fix <a href="">
            # - fix ....

            for _item in body.iter():
                if _item.tag == 'img':
                    _name = _item.get('src')
                    # this is not a good check
                    if _name and not _name.lower().startswith('http'):
                        _item.set('src', 'static/%s' % _convert_file_name(_name))

                for t in self.remove_attributes:
                    if t in _item.attrib:
                        del _item.attrib[t]

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

Example #12

0

Show file

File: misc.py Project: kronoscode/Booktype

def remove_unknown_tags(html_content):
    """
    Remove unknown tags from a given html content string.
    This method is based on a method of Cleaner class on lxml.html module

    Elements whose tag is not listed in ``lxml.html.defs.tags`` lose their
    tag while their content is kept. A disallowed root element is renamed to
    a bare ``div`` instead. The cleaned tree is returned serialised as UTF-8
    with an XML declaration. When the content cannot be parsed the problem is
    logged and ``html_content`` is returned unchanged.
    """

    from lxml.html import defs

    try:
        tree = parse_html_string(html_content)
    except Exception as err:
        logger.error("RemoveUnknownTags: Problem while trying to parse content %s" % err)
        # There is no tree to clean; falling through would fail on the
        # unbound name, so hand the input back as it came in.
        return html_content

    allow_tags = set(defs.tags)

    if allow_tags:
        bad = [el for el in tree.iter() if el.tag not in allow_tags]

        if bad:
            if bad[0] is tree:
                # the root element is renamed rather than dropped
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()

    return etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

Example #13

0

Show file

    def html_after_read(self, book, chapter):
        """Copy the ``<title>`` of a chapter document into ``chapter.title``.

        Items for which ``chapter.is_chapter()`` is false are ignored, as are
        chapters whose content cannot be parsed. For every other chapter the
        parsed tree is serialised back into ``chapter.content``.

        :Args:
          - book: not used by this method
          - chapter: item whose ``content`` and ``title`` are updated in place
        """
        if not chapter.is_chapter():
            return

        from lxml import etree
        from ebooklib.utils import parse_html_string

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: an unparsable chapter keeps its original content.
            return

        # find() returns None for a missing element, so check for None before
        # asking for the number of children.
        head = tree.find('head')

        if head is not None and len(head) != 0:
            title = head.find('title')

            if title is not None:
                chapter.title = title.text

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8',
                                         xml_declaration=True)

Example #14

0

Show file

    def html_after_read(self, book, chapter):
        """Take the chapter title from ``<head>`` and strip unwanted attributes.

        The chapter content is parsed, ``chapter.title`` is set from the
        ``<title>`` element when one exists, every attribute named in
        ``self.remove_attributes`` is deleted from ``<body>`` and its
        descendants, and the tree is serialised back into ``chapter.content``.
        A chapter whose content cannot be parsed is left untouched.

        :Args:
          - book: not used by this method
          - chapter: item whose ``content`` and ``title`` are updated in place
        """
        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: an unparsable chapter keeps its original content.
            return

        # find() returns None for a missing element, so check for None before
        # asking for the number of children.
        head = tree.find('head')

        if head is not None and len(head) != 0:
            title = head.find('title')

            if title is not None:
                chapter.title = title.text

        body = tree.find('body')

        if body is not None and len(body) != 0:
            # todo:
            # - fix <a href="">
            # - fix ....

            for _item in body.iter():
                for t in self.remove_attributes:
                    if t in _item.attrib:
                        del _item.attrib[t]

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8',
                                         xml_declaration=True)

Example #15

0

Show file

    def _parse_nav(self, data, base_path):
        """Read the navigation document and store its outline in ``self.book.toc``.

        The first ``nav`` element that has any attribute equal to ``toc`` is
        used and its ``ol`` list is converted recursively. A list item holding
        a nested ``ol`` becomes a ``(Section, children)`` tuple, a list item
        with only a link becomes a ``Link``, and any other item is skipped.

        :Args:
          - data: markup of the navigation document
          - base_path: path every ``href`` is joined to before normalisation
        """
        # Link targets are URL-style paths with "/" separators. os.path would
        # rewrite them with backslashes on Windows, so join and normalise with
        # posixpath (which is what os.path already is on POSIX systems).
        import posixpath

        html_node = parse_html_string(data)
        nav_node = html_node.xpath("//nav[@*='toc']")[0]

        def parse_list(list_node):
            items = []

            for item_node in list_node.findall("li"):

                sublist_node = item_node.find("ol")
                link_node = item_node.find("a")

                if sublist_node is not None:
                    # the section title is the text of the item's first child
                    title = item_node[0].text
                    children = parse_list(sublist_node)

                    items.append((Section(title), children))

                elif link_node is not None:
                    title = link_node.text
                    href = posixpath.normpath(posixpath.join(base_path, link_node.get("href")))

                    items.append(Link(href, title))

            return items

        self.book.toc = parse_list(nav_node.find("ol"))

Example #16

0

Show file

    def _create_toc(self):
        """
        Create table of contents

        Resets ``self.toc``, ``self.spine`` and ``self.hold_chapters_urls`` and
        then walks the TOC items of ``self.book_version``. An item with a
        chapter is turned into an ``epub.EpubHtml`` document that is added to
        ``self.epub_book`` and to the spine; an item without one becomes an
        ``epub.Section`` entry. A chapter whose content cannot be parsed is
        logged and skipped.

        :Args:
          - self (:class:`ExportBook`): current class instance
        """

        self.toc = OrderedDict()
        self.spine = ['nav']

        self.hold_chapters_urls = [
            held.url_title for held in self.book_version.get_hold_chapters()
        ]

        for toc_item in self.book_version.get_toc():
            if not toc_item.chapter:
                # a plain section heading, no document attached
                section = epub.Section(toc_item.name)

                if toc_item.parent:
                    self.toc[toc_item.parent.id][1].append(section)
                else:
                    self.toc[toc_item.id] = [section, []]

                continue

            document = epub.EpubHtml(
                title=toc_item.chapter.title,
                file_name='%s.xhtml' % (toc_item.chapter.url_title, )
            )

            # hook for some extra customizations
            markup = self._chapter_content_hook(toc_item.chapter.content)

            try:
                tree = parse_html_string(markup.encode('utf-8'))
            except Exception as err:
                logger.error('Error parsing chapter content %s' % err)
                continue

            # hook for some extra customizations
            self._chapter_tree_hook(tree)

            for node in tree.iter():
                self._handle_chapter_element(node)

            document.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

            # hook for some extra customizations
            self._epub_chapter_hook(document)

            self.epub_book.add_item(document)
            self.spine.append(document)

            # file the document under its parent, or at the top level
            if toc_item.parent:
                self.toc[toc_item.parent.id][1].append(document)
            elif toc_item.has_children():
                self.toc[toc_item.id] = [document, []]
            else:
                self.toc[toc_item.id] = document

Example #17

0

Show file

File: epub.py Project: leleu/ebooklib

    def get_content(self, default=None):
        """Return the complete document for this item, serialised as UTF-8.

        The book template named by ``self._template_name`` is used as the
        skeleton. Its root receives the language attributes, a ``head`` holding
        the title and one element per entry of ``self.links``, and a ``body``
        holding the child elements of the parsed ``self.content`` body. An
        empty string is returned when ``self.content`` cannot be parsed.

        ``default`` is accepted but not used.
        """
        tree = parse_string(self.book.get_template(self._template_name))
        tree_root = tree.getroot()

        tree_root.set('lang', self.lang or self.book.language)
        tree_root.attrib['{%s}lang' %
                         NAMESPACES['XML']] = self.lang or self.book.language

        # add to the head also
        #  <meta charset="utf-8" />

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ''

        # create and populate head

        _head = etree.SubElement(tree_root, 'head')

        if self.title != '':
            _title = etree.SubElement(_head, 'title')
            _title.text = self.title

        for lnk in self.links:
            if lnk.get("type") == "text/javascript":
                _lnk = etree.SubElement(_head, 'script', lnk)
                # force <script></script>
                _lnk.text = ''
            else:
                etree.SubElement(_head, 'link', lnk)

        # Children of the source document's <head> are not copied into the
        # new head.

        # create and populate body

        _body = etree.SubElement(tree_root, 'body')

        body = html_tree.find('body')
        if body is not None:
            # list(body) snapshots the children (getchildren() is deprecated);
            # append() moves each element out of the parsed tree.
            for i in list(body):
                _body.append(i)

        tree_str = etree.tostring(tree,
                                  pretty_print=True,
                                  encoding='utf-8',
                                  xml_declaration=True)

        return tree_str

Example #18

0

Show file

File: epub.py Project: chazzam/ebooklib

    def get_content(self, default=None):
        """Return the complete document for this item, serialised as UTF-8.

        The book template named by ``self._template_name`` is used as the
        skeleton. Its root receives the language attributes, a ``head`` holding
        the title, an optional viewport ``meta`` (when both ``img_width`` and
        ``img_height`` are set on the instance) and one element per entry of
        ``self.links``, and a ``body`` holding the child elements of the parsed
        ``self.content`` body. An empty string is returned when
        ``self.content`` cannot be parsed.

        ``default`` is accepted but not used.
        """
        tree = parse_string(self.book.get_template(self._template_name))
        tree_root = tree.getroot()

        tree_root.set("lang", self.lang or self.book.language)
        tree_root.attrib["{%s}lang" % NAMESPACES["XML"]] = self.lang or self.book.language

        # add to the head also
        #  <meta charset="utf-8" />

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ""

        # create and populate head

        _head = etree.SubElement(tree_root, "head")

        if self.title != "":
            _title = etree.SubElement(_head, "title")
            _title.text = self.title

        if hasattr(self, "img_width") and hasattr(self, "img_height"):
            opts = {"name": "viewport", "content": "width=%d, height=%d" % (self.img_width, self.img_height)}
            etree.SubElement(_head, "meta", opts)

        for lnk in self.links:
            if lnk.get("type") == "text/javascript":
                _lnk = etree.SubElement(_head, "script", lnk)
                # force <script></script>
                _lnk.text = ""
            else:
                etree.SubElement(_head, "link", lnk)

        # Children of the source document's <head> are not copied into the
        # new head.

        # create and populate body

        _body = etree.SubElement(tree_root, "body")

        body = html_tree.find("body")
        if body is not None:
            # list(body) snapshots the children (getchildren() is deprecated);
            # append() moves each element out of the parsed tree.
            for i in list(body):
                _body.append(i)

        tree_str = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

        return tree_str

Example #19

0

Show file

    def _create_toc(self):
        """
        Create table of contents

        Resets ``self.toc``, ``self.spine`` and ``self.hold_chapters_urls`` and
        then walks the TOC items of ``self.book_version``. An item with a
        chapter is turned into an ``epub.EpubHtml`` document that is added to
        ``self.epub_book`` and to the spine; an item without one becomes an
        ``epub.Section`` entry. A chapter whose content cannot be parsed is
        logged and skipped.

        :Args:
          - self (:class:`ExportBook`): current class instance
        """

        self.toc = OrderedDict()
        self.spine = ['nav']

        self.hold_chapters_urls = [
            held.url_title for held in self.book_version.get_hold_chapters()
        ]

        for toc_item in self.book_version.get_toc():
            if not toc_item.chapter:
                # a plain section heading, no document attached
                section = epub.Section(toc_item.name)

                if toc_item.parent:
                    self.toc[toc_item.parent.id][1].append(section)
                else:
                    self.toc[toc_item.id] = [section, []]

                continue

            document = epub.EpubHtml(
                title=toc_item.chapter.title,
                file_name='%s.xhtml' % (toc_item.chapter.url_title, )
            )

            # hook for some extra customizations
            markup = self._chapter_content_hook(toc_item.chapter.content)

            try:
                tree = parse_html_string(markup.encode('utf-8'))
            except Exception as err:
                logger.error('Error parsing chapter content %s' % err)
                continue

            # hook for some extra customizations
            self._chapter_tree_hook(tree)

            for node in tree.iter():
                self._handle_chapter_element(node)

            document.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

            # hook for some extra customizations
            self._epub_chapter_hook(document)

            self.epub_book.add_item(document)
            self.spine.append(document)

            # file the document under its parent, or at the top level
            if toc_item.parent:
                self.toc[toc_item.parent.id][1].append(document)
            elif toc_item.has_children():
                self.toc[toc_item.id] = [document, []]
            else:
                self.toc[toc_item.id] = document

Example #20

0

Show file

File: comments.py Project: MarsWan/Booktype

    def html_before_write(self, book, chapter):
        """Remove the comment bubbles from a chapter before it is written.

        Every ``a`` element whose class is exactly ``comment-link`` is dropped
        together with its content, and the resulting tree is serialised back
        into ``chapter.content``. Chapters without content are skipped.
        """
        if not chapter.content:
            return None

        document = parse_html_string(chapter.get_content())

        # remove comments reference bubble from the chapter content
        bubbles = document.xpath(".//a[@class='comment-link']")
        for bubble in bubbles:
            bubble.drop_tree()

        serialized = etree.tostring(
            document, pretty_print=True, encoding='utf-8', xml_declaration=True)
        chapter.content = serialized

Example #21

0

Show file

File: epub.py Project: sdfdsv/BilingualBooks

    def get_content(self, default=None):
        """Return the complete document for this item, serialised as UTF-8.

        The book template named by ``self._template_name`` is used as the
        skeleton. Its root receives the language attributes, a ``head`` and a
        ``body``. The head holds the title, one element per entry of
        ``self.links`` and the children of the source document's head (its
        ``title`` is skipped when ``self.title`` is set). The body holds the
        child elements of the parsed ``self.content`` body. An empty string is
        returned when ``self.content`` cannot be parsed.

        ``default`` is accepted but not used.
        """
        tree = parse_string(self.book.get_template(self._template_name))
        tree_root = tree.getroot()

        tree_root.set('lang', self.lang or self.book.language)
        tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = self.lang or self.book.language

        # add to the head also
        #  <meta charset="utf-8" />

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ''

        # create and populate head

        _head = etree.SubElement(tree_root, 'head')

        if self.title != '':
            _title = etree.SubElement(_head, 'title')
            _title.text = self.title

        for lnk in self.links:
            if lnk.get("type") == "text/javascript":
                _lnk = etree.SubElement(_head, 'script', lnk)
                # force <script></script>
                _lnk.text = ''
            else:
                etree.SubElement(_head, 'link', lnk)

        # this should not be like this
        head = html_tree.find('head')
        if head is not None:
            # list(head) snapshots the children (getchildren() is deprecated);
            # append() moves each element out of the parsed tree.
            for i in list(head):
                if i.tag == 'title' and self.title != '':
                    continue
                _head.append(i)

        # create and populate body

        _body = etree.SubElement(tree_root, 'body')

        body = html_tree.find('body')
        if body is not None:
            for i in list(body):
                _body.append(i)

        tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

        return tree_str

Example #22

0

Show file

File: booktype.py Project: SoldierGamma/EWR

    def html_before_write(self, book, chapter):
        """Rewrite the links of a chapter so they target the written files.

        For every ``a`` element found while the document has a non-empty
        body:

        - a footnote link (href containing ``InsertNoteID``) is reduced to its
          ``#fragment`` part;
        - a link without URL scheme gets ``.xhtml`` appended to its path, its
          fragment re-attached, and a ``name`` attribute turned into ``id``.

        The tree is then serialised back into ``chapter.content``. A chapter
        whose content cannot be parsed is left untouched.
        """
        from lxml import etree

        try:
            from urlparse import urlparse, urljoin
        except ImportError:
            from urllib.parse import urlparse, urljoin

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: an unparsable chapter keeps its original content.
            return

        # find() returns None when there is no body, so check for None before
        # asking for the number of children.
        body = tree.find('body')

        if body is not None and len(body) != 0:
            # should also be aware to handle
            # ../chapter/
            # ../chapter/#reference
            # ../chapter#reference

            for _link in body.xpath('//a'):
                _ln = _link.get('href', '')

                # This is just temporary for the footnotes
                if 'InsertNoteID' in _ln:
                    i = _ln.find('#')

                    # find() gives -1 when the href has no fragment; slicing
                    # with that would keep only the last character, so such
                    # an href is left as it is.
                    if i != -1:
                        _link.set('href', _ln[i:])

                    continue

                _u = urlparse(_ln)

                # Let us care only for internal links at the moment
                if _u.scheme == '':
                    if _u.path != '':
                        _link.set('href', '%s.xhtml' % _u.path)

                    if _u.fragment != '':
                        _link.set(
                            'href',
                            urljoin(_link.get('href'), '#%s' % _u.fragment))

                    if _link.get('name') is not None:
                        _link.set('id', _link.get('name'))
                        etree.strip_attributes(_link, 'name')

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8')

Example #23

0

Show file

File: comments.py Project: zeuser/Booktype

    def html_before_write(self, book, chapter):
        """Remove the comment bubbles from a chapter before it is written.

        Every ``a`` element whose class is exactly ``comment-link`` is dropped
        together with its content, and the resulting tree is serialised back
        into ``chapter.content``. Chapters without content are skipped.
        """
        if not chapter.content:
            return None

        document = parse_html_string(chapter.get_content())

        # remove comments reference bubble from the chapter content
        bubbles = document.xpath(".//a[@class='comment-link']")
        for bubble in bubbles:
            bubble.drop_tree()

        serialized = etree.tostring(document,
                                    pretty_print=True,
                                    encoding='utf-8',
                                    xml_declaration=True)
        chapter.content = serialized

Example #24

0

Show file

    def html_before_write(self, book, chapter):
        """Replace embedded source listings with Pygments-highlighted markup.

        Every ``pre`` element whose class contains ``source-python`` or
        ``source-css`` is replaced by the highlighter output (CSS wins when
        both are present). ``pre`` elements with any other ``source-`` class
        are left as they are. When at least one listing was replaced, the
        ``style/code.css`` stylesheet is linked and ``chapter.content`` is
        rewritten. A chapter whose content cannot be parsed is left untouched.
        """
        from lxml import etree, html

        from pygments import highlight
        from pygments.formatters import HtmlFormatter

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: an unparsable chapter keeps its original content.
            return

        had_source = False

        # find() returns None when there is no body, so check for None before
        # asking for the number of children.
        body = tree.find('body')

        if body is not None and len(body) != 0:
            # check for embeded source
            for source in body.xpath('//pre[contains(@class,"source-")]'):
                css_class = source.get('class')

                lexer = None

                if 'source-python' in css_class:
                    from pygments.lexers import PythonLexer

                    lexer = PythonLexer()

                if 'source-css' in css_class:
                    from pygments.lexers import CssLexer

                    lexer = CssLexer()

                if lexer is None:
                    # No lexer for this language. Previously the result of an
                    # earlier listing was reused here, or the name was unbound.
                    continue

                # html.tostring() returns ASCII bytes by default; decode them
                # so they can be joined with the text on Python 3 as well.
                source_text = (source.text or '') + ''.join(
                    html.tostring(child).decode('ascii')
                    for child in source.iterchildren())

                _text = highlight(source_text, lexer, HtmlFormatter())

                _parent = source.getparent()
                _parent.replace(source, etree.XML(_text))

                had_source = True

        if had_source:
            chapter.add_link(href="style/code.css",
                             rel="stylesheet",
                             type="text/css")
            chapter.content = etree.tostring(tree,
                                             pretty_print=True,
                                             encoding='utf-8')

Example #25

0

Show file

File: content_cleanup_plugin.py Project: xanjay/Booktype

    def html_before_write(self, book, chapter):
        """Clean up a document item before it is written to the EPUB.

        Items that are not documents, and the navigation item, are skipped.
        For every other item the configured tags are removed with their
        content, the configured attributes are deleted from all elements below
        ``body``, and elements that ``recursively_empty`` reports as empty are
        unwrapped unless their tag or one of their classes is on the
        configured allow lists. The result is serialised back into
        ``chapter.content``.

        :Returns:
          Always ``True``.
        """
        if chapter.get_type() != ebooklib.ITEM_DOCUMENT or isinstance(
                chapter, ebooklib.epub.EpubNav):
            return True

        tags_allowed_to_be_empty = config.get_configuration(
            'ALLOWED_EMPTY_TAGS')
        tags_to_remove_on_cleanup = config.get_configuration(
            'TAGS_TO_REMOVE_ON_CLEANUP')
        attrs_to_remove_on_cleanup = config.get_configuration(
            'ATTRS_TO_REMOVE_ON_CLEANUP')
        allowed_empty_by_classes = config.get_configuration(
            'ALLOWED_EMPTY_BY_CLASSES')

        root = parse_html_string(chapter.get_content())

        # let's remove all the tags we don't want to have on export
        # this will affect all the converters since they use the generated
        # epub as base for converting process
        for tag in tags_to_remove_on_cleanup:
            # snapshot the matches so that dropping nodes cannot disturb the
            # traversal that produces them
            for node in list(root.iter(tag)):
                node.drop_tree()

        # walk over all elements in the tree and remove all
        # nodes that are recursively empty
        # The expression is absolute, so evaluating it on the root selects the
        # same elements and simply yields nothing for a document without body.
        for elem in root.xpath("//body//*"):
            # remove not wanted attributes
            for attr in attrs_to_remove_on_cleanup:
                if attr in elem.attrib:
                    del elem.attrib[attr]

            klasses = elem.get('class', '').split()
            allowed_by_class = any(
                x in allowed_empty_by_classes for x in klasses)

            if recursively_empty(
                    elem
            ) and elem.tag not in tags_allowed_to_be_empty and not allowed_by_class:
                # just in case if text contains spaces or tabs, because drop_tag removes only tag
                elem.text = ''
                elem.drop_tag()

        chapter.content = etree.tostring(root,
                                         pretty_print=True,
                                         encoding="utf-8",
                                         xml_declaration=True)

        return True

Example #26

0

Show file

def ice_cleanup(content, **kwargs):
    """
    Strip change-tracking markup from ``content`` and return the parsed tree.

    Elements named ``kwargs['tag']`` whose ``class`` attribute contains
    ``kwargs['insert_class']`` are dropped together with their content.
    Elements whose ``class`` contains ``kwargs['delete_class']`` are unwrapped:
    the tag is removed and its content is kept. Matching is a substring test
    on the whole ``class`` attribute.

    :param content: HTML source handed to ``parse_html_string``
    :param kwargs: must provide ``tag``, ``insert_class`` and ``delete_class``
    :returns: the tree produced by ``parse_html_string``, modified in place
    """
    tree = parse_html_string(content)

    # remove tag and content of inserted changes
    spans_with_inserts = tree.xpath("//%(tag)s[contains(@class, '%(insert_class)s')]" % kwargs)
    for span in spans_with_inserts:
        span.drop_tree()

    # remove tag, but keep content of deleted changes; this query runs after
    # the loop above, so it is evaluated against the already pruned tree
    spans_with_deletes = tree.xpath("//%(tag)s[contains(@class, '%(delete_class)s')]" % kwargs)
    for span in spans_with_deletes:
        span.drop_tag()

    return tree

Example #27

0

Show file

File: booktype.py Project: 171230839/ebooklib

    def html_before_write(self, book, chapter):
        """
        Rewrite the links of *chapter* before it is written out.

        Footnote links (href containing ``InsertNoteID``) are reduced to their
        ``#fragment`` part. Every other link without a URL scheme gets
        ``.xhtml`` appended to its path, keeps its fragment, and has a ``name``
        attribute converted into ``id``. The serialized tree is stored back in
        ``chapter.content``. Content that cannot be parsed is left untouched.
        """
        from lxml import etree

        try:
            from urlparse import urlparse, urljoin
        except ImportError:
            from urllib.parse import urlparse, urljoin

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # best effort: leave chapters we can not parse as they are
            return

        # find() returns None when there is no <body>, so check for that
        # instead of calling len() on the result
        body = tree.find('body')

        if body is not None and len(body) != 0:
            # should also be aware to handle
            # ../chapter/
            # ../chapter/#reference
            # ../chapter#reference

            for _link in body.xpath('//a'):
                _ln = _link.get('href', '')

                # This is just temporary for the footnotes
                if 'InsertNoteID' in _ln:
                    i = _ln.find('#')
                    # with no '#' present, _ln[i:] would slice from -1 and
                    # keep only the last character of the href
                    if i != -1:
                        _link.set('href', _ln[i:])

                    continue

                _u = urlparse(_ln)

                # Let us care only for internal links at the moment
                if _u.scheme == '':
                    if _u.path != '':
                        _link.set('href', '%s.xhtml' % _u.path)

                    if _u.fragment != '':
                        _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))

                    if _link.get('name') is not None:
                        _link.set('id', _link.get('name'))
                        etree.strip_attributes(_link, 'name')

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')

Example #28

0

Show file

File: readablepub.py Project: nicolov/readablepub

    def html_before_write(self, book, chapter):
        """
        Download every image referenced by *chapter* and embed it in *book*.

        Each ``<img>`` with a ``src`` is fetched with ``requests.get``, added
        to the book as an ``EpubItem`` named after the slugified URL, and
        re-pointed at that local file. Images without a ``src`` are skipped.
        Content that cannot be parsed is left untouched.
        """
        try:
            html_tree = parse_html_string(chapter.content)
        except Exception:
            return

        for img_elem in html_tree.iterfind(".//img"):
            href = img_elem.get("src")
            if not href:
                # nothing to download; attrib["src"] used to raise KeyError here
                continue

            # We can just slugify the original URL to determine the new URL
            stem, extension = os.path.splitext(href)
            img_local_filename = slugify(stem) + extension
            book.add_item(
                epub.EpubItem(uid=img_local_filename, file_name=img_local_filename, content=requests.get(href).content)
            )
            # Alter the HTML element to point at the local resource
            img_elem.set("src", img_local_filename)

        chapter.content = etree.tostring(html_tree, pretty_print=True, encoding="utf-8")

Example #29

0

Show file

File: sourcecode.py Project: 171230839/ebooklib

    def html_before_write(self, book, chapter):
        """
        Replace embedded source listings with pygments-highlighted markup.

        Every ``<pre>`` whose class contains ``source-`` is looked at. Blocks
        marked ``source-python`` or ``source-css`` are replaced by the output
        of pygments' ``HtmlFormatter``; blocks with any other ``source-*``
        class are left as they are. When at least one block was replaced, the
        chapter gets a link to ``style/code.css`` and its content is
        re-serialized.
        """
        from lxml import etree, html

        from pygments import highlight
        from pygments.formatters import HtmlFormatter

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        had_source = False

        # find() returns None when there is no <body>
        body = tree.find('body')

        if body is not None and len(body) != 0:
            # check for embedded source
            for source in body.xpath('//pre[contains(@class,"source-")]'):
                css_class = source.get('class')

                # NOTE(review): html.tostring() may return bytes on Python 3,
                # which would not join with str - confirm the target version
                source_text = (source.text or '') + ''.join([html.tostring(child) for child in source.iterchildren()])

                # two independent checks (not elif): when both markers are
                # present the CSS lexer wins, as before
                lexer = None

                if 'source-python' in css_class:
                    from pygments.lexers import PythonLexer

                    lexer = PythonLexer()

                if 'source-css' in css_class:
                    from pygments.lexers import CssLexer

                    lexer = CssLexer()

                if lexer is None:
                    # unknown language: previously this hit an unbound (or
                    # stale, from an earlier block) _text
                    continue

                _text = highlight(source_text, lexer, HtmlFormatter())

                _parent = source.getparent()
                _parent.replace(source, etree.XML(_text))

                had_source = True

        if had_source:
            chapter.add_link(href="style/code.css", rel="stylesheet", type="text/css")
            chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')

Example #30

0

Show file

File: misc.py Project: kronoscode/Booktype

    def html_after_read(self, book, chapter):
        """
        Copy the document ``<title>`` into ``chapter.title`` and re-serialize.

        Items for which ``chapter.is_chapter()`` is false, and content that
        ``parse_html_string`` cannot parse, are left untouched.
        """
        if not chapter.is_chapter():
            return

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        # find() returns None for a missing element, so test for that rather
        # than calling len() on the result (TypeError on None)
        head = tree.find('head')
        title = head.find('title') if head is not None else None

        if title is not None:
            chapter.title = title.text

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

Example #31

0

Show file

    def html_before_write(self, book, chapter):
        """
        Fetch the images used by *chapter* and store them inside *book*.

        For each ``<img>`` carrying a ``src``, the resource is downloaded with
        ``requests.get`` and added as an ``EpubItem`` whose file name is the
        slugified URL (extension preserved). The element is then rewritten to
        reference that file. ``<img>`` elements with a missing or empty
        ``src`` are ignored, and unparsable content is left as it is.
        """
        try:
            html_tree = parse_html_string(chapter.content)
        except Exception:
            return

        for img_elem in html_tree.iterfind('.//img'):
            href = img_elem.get('src')
            if not href:
                # attrib['src'] raised KeyError for images without a source
                continue

            # We can just slugify the original URL to determine the new URL
            base, ext = os.path.splitext(href)
            img_local_filename = slugify(base) + ext
            book.add_item(
                epub.EpubItem(uid=img_local_filename,
                              file_name=img_local_filename,
                              content=requests.get(href).content))
            # Alter the HTML element to point at the local resource
            img_elem.set('src', img_local_filename)

        chapter.content = etree.tostring(html_tree,
                                         pretty_print=True,
                                         encoding='utf-8')

Example #32

0

Show file

File: content_cleanup_plugin.py Project: danielhjames/Booktype

    def html_before_write(self, book, chapter):
        """
        Clean up the markup of a document chapter before export.

        Removes every tag listed in ``TAGS_TO_REMOVE_ON_CLEANUP`` together
        with its content, strips the attributes listed in
        ``ATTRS_TO_REMOVE_ON_CLEANUP`` from all elements below ``<body>``, and
        unwraps elements that ``recursively_empty`` reports as empty unless
        their tag or one of their classes is whitelisted. Items that are not
        documents, and the navigation item, are skipped.

        :returns: always ``True``
        """
        if chapter.get_type() != ebooklib.ITEM_DOCUMENT or isinstance(chapter, ebooklib.epub.EpubNav):
            return True

        tags_allowed_to_be_empty = config.get_configuration('ALLOWED_EMPTY_TAGS')
        tags_to_remove_on_cleanup = config.get_configuration('TAGS_TO_REMOVE_ON_CLEANUP')
        attrs_to_remove_on_cleanup = config.get_configuration('ATTRS_TO_REMOVE_ON_CLEANUP')
        allowed_empty_by_classes = config.get_configuration('ALLOWED_EMPTY_BY_CLASSES')

        root = parse_html_string(chapter.get_content())

        # let's remove all the tags we don't want to have on export
        # this will affect all the converters since they use the generated
        # epub as base for converting process
        for tag in tags_to_remove_on_cleanup:
            # snapshot the matches first: detaching nodes while root.iter()
            # is still walking the tree can cut the walk short
            for node in list(root.iter(tag)):
                node.drop_tree()

        # walk over all elements in the tree and remove all
        # nodes that are recursively empty
        body = root.find('body')

        # without a <body> there is nothing to walk (this used to raise
        # AttributeError on None)
        elements = body.xpath("//body//*") if body is not None else []

        for elem in elements:
            # remove not wanted attributes
            for attr in attrs_to_remove_on_cleanup:
                if attr in elem.attrib:
                    del elem.attrib[attr]

            klasses = elem.get('class', '').split()
            allowed_by_class = any(x in allowed_empty_by_classes for x in klasses)

            if recursively_empty(elem) and elem.tag not in tags_allowed_to_be_empty and not allowed_by_class:
                # just in case if text contains spaces or tabs, because drop_tag removes only tag
                elem.text = ''
                elem.drop_tag()

        chapter.content = etree.tostring(root, pretty_print=True, encoding="utf-8", xml_declaration=True)

        return True

Example #33

0

Show file

File: icejs.py Project: danielhjames/Booktype

def ice_cleanup(content, **kwargs):
    """
    Resolve the pending changes recorded by the tracking engine in ``content``.

    Pending insertions are thrown away and pending deletions are restored:

    <span class="ins">tag and content are removed</span> -> the insertion was never approved.
    <span class="del">tag is removed, content stays</span> -> this is the previous content state.

    ``kwargs`` supplies ``tag``, ``insert_class`` and ``delete_class``; the
    parsed and cleaned tree is returned.
    """

    document = parse_html_string(content)

    # not approved insertions: drop the element and everything inside it
    insert_query = "//%(tag)s[contains(@class, '%(insert_class)s')]" % kwargs
    for inserted in document.xpath(insert_query):
        inserted.drop_tree()

    # tracked deletions: unwrap the element, its content stays in place
    delete_query = "//%(tag)s[contains(@class, '%(delete_class)s')]" % kwargs
    for deleted in document.xpath(delete_query):
        deleted.drop_tag()

    return document

Example #34

0

Show file

File: booktype.py Project: SoldierGamma/EWR

    def html_before_write(self, book, chapter):
        """
        Convert editor footnotes in *chapter* into EPUB3 ``noteref``/``footnote`` markup.

        For every ``<span class="InsertNoteMarker">`` the inner link is tagged
        ``epub:type="noteref"`` and an ``<aside epub:type="footnote">`` holding
        the text of the matching ``<li>`` is appended to ``<body>``. The
        original ``<ol id="InsertNote_NoteList">`` is removed afterwards.
        Markers that are malformed or have no matching list item are skipped.
        Content that cannot be parsed is left untouched.
        """
        from lxml import etree

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        # find() returns None when there is no <body>
        body = tree.find('body')

        if body is not None and len(body) != 0:
            # <span id="InsertNoteID_1_marker1" class="InsertNoteMarker"><sup><a href="#InsertNoteID_1">1</a></sup><span>
            # <ol id="InsertNote_NoteList"><li id="InsertNoteID_1">prvi footnote <span id="InsertNoteID_1_LinkBacks"><sup><a href="#InsertNoteID_1_marker1">^</a></sup></span></li>

            # <a epub:type="noteref" href="#n1">1</a></p>
            # <aside epub:type="footnote" id="n1"><p>These have been corrected in this EPUB3 edition.</p></aside>
            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
                # strip the "_marker<N>" suffix; a fixed [:-8] slice only
                # worked for single-digit marker numbers
                footnote_id = (footnote.get('id') or '').rsplit('_marker', 1)[0]

                try:
                    # first child of the first child, the <a> in the markup above
                    a = footnote[0][0]
                except IndexError:
                    continue

                matches = body.xpath('//li[@id="%s"]' % footnote_id)
                if not matches:
                    continue

                footnote_text = matches[0]

                a.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'noteref'
                ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
                ftn.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'footnote'
                ftn_p = etree.SubElement(ftn, 'p')
                ftn_p.text = footnote_text.text

            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
            if old_footnote:
                # remove through the actual parent: body.remove() raises
                # ValueError when the list is not a direct child of <body>
                old_list = old_footnote[0]
                old_list.getparent().remove(old_list)

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8')

Example #35

0

Show file

    def html_after_read(self, book, chapter):
        """
        Normalize a chapter right after it was read from an EPUB.

        Copies the document ``<title>`` into ``chapter.title``, rewrites the
        ``src`` of images that do not start with ``http`` to
        ``static/<converted name>``, and deletes every attribute listed in
        ``self.remove_attributes`` from the body and its descendants. Content
        that cannot be parsed is left untouched.
        """
        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        # find() returns None for a missing element; calling len() on that
        # result raised TypeError
        head = tree.find('head')
        title = head.find('title') if head is not None else None

        if title is not None:
            chapter.title = title.text

        body = tree.find('body')

        if body is not None and len(body) != 0:
            # todo:
            # - fix <a href="">
            # - fix ....

            for _item in body.iter():
                if _item.tag == 'img':
                    _name = _item.get('src')
                    # this is not a good check
                    if _name and not _name.lower().startswith('http'):
                        _item.set('src',
                                  'static/%s' % _convert_file_name(_name))

                for t in self.remove_attributes:
                    if t in _item.attrib:
                        del _item.attrib[t]

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8',
                                         xml_declaration=True)

Example #36

0

Show file

File: epub.py Project: chazzam/ebooklib

    def get_body_content(self):
        """
        Return the serialized content of the ``<body>`` element of this item.

        ``self.content`` is parsed and the body is serialized as UTF-8 without
        an XML declaration. When the serialization starts with a plain
        ``<body>`` tag, the enclosing ``<body>``/``</body>`` tags are stripped.

        :returns: the serialized body, or ``""`` when the content cannot be
            parsed or the body is missing or has no child elements
        """
        # NOTE(review): the result is never used; the call is kept only in
        # case get_content() has side effects - confirm and drop it
        content = self.get_content()

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            return ""

        # find() returns None when there is no <body>
        body = html_tree.find("body")

        if body is None or len(body) == 0:
            return ""

        tree_str = etree.tostring(body, pretty_print=True, encoding="utf-8", xml_declaration=False)

        # tostring() with an explicit encoding yields a byte string, so compare
        # against byte literals (str literals fail on Python 3)
        open_tag = b"<body>"

        # this is so stupid
        if tree_str.startswith(open_tag):
            n = tree_str.rindex(b"</body>")

            # slice right after the tag; the old fixed offset of 7 swallowed
            # the first character of the body
            return tree_str[len(open_tag):n]

        return tree_str

Example #37

0

Show file

File: booktype.py Project: 171230839/ebooklib

    def html_before_write(self, book, chapter):
        """
        Turn the editor's footnote markup into EPUB3 footnotes.

        Each ``<span class="InsertNoteMarker">`` has its inner link marked as
        ``epub:type="noteref"``, and a new ``<aside epub:type="footnote">``
        with the text of the referenced ``<li>`` is appended to ``<body>``.
        The old ``<ol id="InsertNote_NoteList">`` is then removed. Markers
        without the expected children or without a matching list item are
        skipped, and unparsable content is left as it is.
        """
        from lxml import etree

        from ebooklib import epub

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        body = tree.find('body')

        # no <body> (find() returned None) or an empty one: only re-serialize
        if body is not None and len(body) != 0:
            # <span id="InsertNoteID_1_marker1" class="InsertNoteMarker"><sup><a href="#InsertNoteID_1">1</a></sup><span>
            # <ol id="InsertNote_NoteList"><li id="InsertNoteID_1">prvi footnote <span id="InsertNoteID_1_LinkBacks"><sup><a href="#InsertNoteID_1_marker1">^</a></sup></span></li>

            # <a epub:type="noteref" href="#n1">1</a></p>
            # <aside epub:type="footnote" id="n1"><p>These have been corrected in this EPUB3 edition.</p></aside>
            for footnote in body.xpath('//span[@class="InsertNoteMarker"]'):
                marker_id = footnote.get('id') or ''
                # cut at the "_marker<N>" suffix instead of a fixed 8
                # characters, which broke for marker numbers of 10 and above
                footnote_id = marker_id.rsplit('_marker', 1)[0]

                footnote_items = body.xpath('//li[@id="%s"]' % footnote_id)

                # the link is the first child of the marker's first child
                if len(footnote) == 0 or len(footnote[0]) == 0 or not footnote_items:
                    continue

                a = footnote[0][0]
                footnote_text = footnote_items[0]

                a.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'noteref'
                ftn = etree.SubElement(body, 'aside', {'id': footnote_id})
                ftn.attrib['{%s}type' % epub.NAMESPACES['EPUB']] = 'footnote'
                ftn_p = etree.SubElement(ftn, 'p')
                ftn_p.text = footnote_text.text

            old_footnote = body.xpath('//ol[@id="InsertNote_NoteList"]')
            if old_footnote:
                # detach from its real parent; body.remove() raises ValueError
                # unless the list is a direct child of <body>
                note_list = old_footnote[0]
                note_list.getparent().remove(note_list)

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')

Example #38

0

Show file

def ice_cleanup(content, **kwargs):
    """
    Clean the tracking engine trail out of ``content``.

    "Inserted" changes lose both tag and content, "deleted" changes lose only
    the tag. For example:

    <span class="ins">content and tag will be deleted</span> -> not approved yet.
    <span class="del">content will be kept and tag removed</span> -> previous content state.

    The element name and both class names are taken from ``kwargs`` (``tag``,
    ``insert_class``, ``delete_class``). Returns the parsed tree.
    """

    parsed = parse_html_string(content)

    # inserted changes were never approved: remove them completely
    for node in parsed.xpath("//%(tag)s[contains(@class, '%(insert_class)s')]" % kwargs):
        node.drop_tree()

    # deleted changes hold the previous content: keep it, lose the wrapper
    for node in parsed.xpath("//%(tag)s[contains(@class, '%(delete_class)s')]" % kwargs):
        node.drop_tag()

    return parsed

Example #39

0

Show file

File: views.py Project: sourcefabric/Booktype

def export_book(input_file, filename):
    """Reads content of book in Booki.zip format and converts it to EPUB format.

    This function reads content of the book in Booki.zip file, creates new
    book in EPUB format and converts entire content into it. There are some
    things which are different in new EPUB format. One of them is how links
    and interlinks are handled: links without a scheme that point at another
    chapter of the book are rewritten to the new ``.xhtml`` file names, other
    scheme-less links are unwrapped (tag removed, text kept).

    Chapters which can not be parsed are left out of the book entirely.

    :param input_file: passed to ``BookiZip`` to open the source book
    :param filename: passed to ``epub.write_epub`` as the output target
    """

    epub_book = ExportEpubBook()

    # Creating new EPUB file
    epub_book.add_prefix("bkterms", "http://booktype.org/")

    # Read old Booki.zip format
    bookizip = BookiZip(input_file)

    _toc, _section, _section_name = [], [], None
    spine = ["nav"]

    # Get file names of all the chapters/sections; a set, because it is only
    # used for membership tests
    file_names = set(file_name[6:-5] for _, file_name, _ in bookizip.get_toc())

    for typ, file_name, title in bookizip.get_toc():
        # A section entry: flush the chapters collected so far and start
        # collecting for the new section
        if typ == 1:
            if _section_name is None and len(_section) > 0:
                _toc.append(_section)
            elif len(_section) > 0:
                _toc.append((epub.Section(_section_name), _section[:]))

            _section_name = title
            _section = []
            continue

        # Create new chapter with new filename
        c1 = epub.EpubHtml(title=title, file_name="{}.xhtml".format(file_name[6:-5]))
        cont = unicode(bookizip.read(file_name), "utf-8")

        try:
            tree = parse_html_string(cont.encode("utf-8"))
        except Exception:
            # Just ignore everything if we can not parse the chapter
            continue

        # Register the chapter in the TOC only once it parsed, so a skipped
        # chapter does not leave a TOC entry without a file behind it
        _section.append(c1)

        # Change all the links in the document; iterate over a snapshot
        # because links may be unwrapped inside the loop
        for elem in list(tree.iter("a")):
            href = elem.get("href")

            if not href:
                continue

            urlp = urlparse.urlparse(href)
            url_title = urlp.path

            if urlp.scheme != "":
                continue

            if url_title and url_title in file_names:
                fixed_href = url_title + ".xhtml"
                if urlp.fragment:
                    fixed_href = "{}#{}".format(fixed_href, urlp.fragment)

                elem.set("href", fixed_href)
            else:
                # this removes everything that is external; that should
                # not be happening
                elem.drop_tag()

        # Serialize once per chapter (this used to run for every element)
        c1.content = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

        epub_book.add_item(c1)
        spine.append(c1)

    if _section_name is None and len(_section) > 0:
        _toc.append(_section)
    elif len(_section) > 0:
        _toc.append((epub.Section(_section_name), _section[:]))

    # Add all of the attachments
    for att_name in bookizip.get_attachments():
        try:
            blob = bookizip.read(att_name)
        except (IOError, OSError):
            continue
        else:
            itm = epub.EpubImage()
            itm.file_name = att_name
            itm.content = blob
            epub_book.add_item(itm)

    epub_book.set_title("Title", "main")
    epub_book.set_language("en")
    epub_book.add_author("Author", role="aut", uid="author")

    epub_book.toc = _toc
    epub_book.spine = spine

    epub_book.add_item(epub.EpubNcx())
    epub_book.add_item(epub.EpubNav())

    opts = {"plugins": [TidyPlugin(), standard.SyntaxPlugin()]}
    epub.write_epub(filename, epub_book, opts)

Example #40

0

Show file

File: standard.py Project: 171230839/ebooklib

    def html_before_write(self, book, chapter):
        """
        Strip deprecated tags and non-whitelisted attributes from *chapter*.

        Tags listed in ``DEPRECATED_TAGS`` are unwrapped. Every element in
        ``<head>`` and ``<body>`` is then reduced to ``ATTRIBUTES_GLOBAL``
        plus the attributes allowed for its tag. ``<meta>`` elements are
        removed, a table ``summary`` is turned into a ``<caption>``, ``<dl>``
        is left untouched, and chapters with remote images or inline SVG get
        the matching entry in ``chapter.properties``.

        :returns: the serialized chapter content, or ``None`` when the content
            can not be parsed
        """
        from lxml import etree

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            return

        root = tree.getroottree()

        # delete deprecated tags
        # i should really have a list of allowed tags
        for tag in DEPRECATED_TAGS:
            etree.strip_tags(root, tag)

        # attributes allowed per tag, on top of ATTRIBUTES_GLOBAL
        head_attributes = {
            'base': ['href', 'target'],
            'link': ['href', 'crossorigin', 'rel', 'media', 'hreflang', 'type', 'sizes'],
            'script': ['src', 'type', 'charset', 'async', 'defer', 'crossorigin'],
            'source': ['src', 'type', 'media'],
            'style': ['media', 'type', 'scoped'],
        }

        body_attributes = {
            'a': ['href', 'target', 'download', 'rel', 'hreflang', 'type'],
            'area': ['alt', 'coords', 'shape', 'href', 'target', 'download', 'rel', 'hreflang', 'type'],
            'audio': ['src', 'crossorigin', 'preload', 'autoplay', 'mediagroup', 'loop', 'muted', 'controls'],
            'blockquote': ['cite'],
            'button': ['autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
                       'formtarget', 'name', 'type', 'value', 'menu'],
            'canvas': ['width', 'height'],
            'del': ['cite', 'datetime'],
            'details': ['open'],
            'embed': ['src', 'type', 'width', 'height'],
            # was 'disable', which is not the name of the HTML attribute
            'fieldset': ['disabled', 'form', 'name'],
            # this list used to sit behind a second, unreachable 'details' test
            'form': ['accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target'],
            'iframe': ['src', 'srcdoc', 'name', 'sandbox', 'seamless', 'allowfullscreen', 'width', 'height'],
            # a missing comma used to fuse 'step' and 'type' into 'steptype'
            'input': ['accept', 'alt', 'autocomplete', 'autofocus', 'checked', 'dirname',
                      'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate',
                      'formtarget', 'height', 'inputmode', 'list', 'max', 'maxlength', 'min', 'multiple',
                      'name', 'pattern', 'placeholder', 'readonly', 'required', 'size', 'src', 'step',
                      'type', 'value', 'width'],
            'ins': ['cite', 'datetime'],
            'keygen': ['autofocus', 'challenge', 'disabled', 'form', 'keytype', 'name'],
            'label': ['form', 'for'],
            'map': ['name'],
            'menu': ['type', 'label'],
            'object': ['data', 'type', 'typemustmatch', 'name', 'usemap', 'form', 'width', 'height'],
            'ol': ['reversed', 'start', 'type'],
            'optgroup': ['disabled', 'label'],
            'option': ['disabled', 'label', 'selected', 'value'],
            'output': ['for', 'form', 'name'],
            'param': ['name', 'value'],
            'progress': ['value', 'max'],
            'q': ['cite'],
            'select': ['autofocus', 'disabled', 'form', 'multiple', 'name', 'required', 'size'],
            'td': ['colspan', 'rowspan', 'headers'],
            'textarea': ['autocomplete', 'autofocus', 'cols', 'dirname', 'disabled', 'form',
                         'inputmode', 'maxlength', 'name', 'placeholder', 'readonly', 'required',
                         'rows', 'wrap'],
            'col': ['span'],
            'colgroup': ['span'],
            'th': ['colspan', 'rowspan', 'headers', 'scope', 'abbr', 'sorted'],
            'time': ['datetime'],
            'track': ['kind', 'src', 'srclang', 'label', 'default'],
            'video': ['src', 'crossorigin', 'poster', 'preload', 'autoplay', 'mediagroup',
                      'loop', 'muted', 'controls', 'width', 'height'],
        }

        head = tree.find('head')

        if head is not None and len(head) != 0:
            # iterate over a snapshot: children are removed inside the loop
            for _item in list(head):
                if _item.tag == 'title':
                    # NOTE(review): the text of an empty element may be None
                    # rather than '', in which case this never matches - confirm
                    if _item.text == '':
                        head.remove(_item)
                elif _item.tag == 'meta':
                    leave_only(_item, ATTRIBUTES_GLOBAL + ['name', 'http-equiv', 'content', 'charset'])
                    # just remove for now, but really should not be like this
                    head.remove(_item)
                else:
                    leave_only(_item, ATTRIBUTES_GLOBAL + head_attributes.get(_item.tag, []))

        # find() returns None when there is no <body>
        body = tree.find('body')

        if body is not None and len(body) != 0:
            for _item in body.iter():
                # it is not
                # <a class="indexterm" href="ch05.html#ix_epub:trigger_element">
                _tag = _item.tag

                if _tag == 'img':
                    _src = _item.get('src', '').lower()
                    if _src.startswith('http://') or _src.startswith('https://'):
                        if 'remote-resources' not in chapter.properties:
                            chapter.properties.append('remote-resources')
                            # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
                            # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
                            from ebooklib import epub
                            _img = epub.EpubImage(file_name=_item.get('src'))
                            book.add_item(_img)
                    leave_only(_item, ATTRIBUTES_GLOBAL + ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width', 'height'])
                elif _tag == 'table':
                    if _item.get('border', None):
                        if _item.get('border') == '0':
                            _item.set('border', '')

                    if _item.get('summary', None):
                        # add it as caption
                        _caption = etree.Element('caption', {})
                        _caption.text = _item.get('summary')
                        _item.insert(0, _caption)

                        del _item.attrib['summary']

                    leave_only(_item, ATTRIBUTES_GLOBAL + ['border', 'sortable'])
                elif _tag == 'dl':
                    # left untouched on purpose, attributes included
                    # http://html5doctor.com/the-dl-element/
                    # should be like this really
                    # some of the elements can be missing
                    # dl
                    #   dt
                    #   dd
                    #   dt
                    #   dd
                    pass
                elif _tag == 'svg':
                    # We need to add property "svg" in case we have embedded svg file
                    if 'svg' not in chapter.properties:
                        chapter.properties.append('svg')

                    if _item.get('viewbox', None):
                        del _item.attrib['viewbox']

                    if _item.get('preserveaspectratio', None):
                        del _item.attrib['preserveaspectratio']
                elif _tag in body_attributes:
                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attributes[_tag])
                else:
                    # copy the keys first: attributes are deleted in the loop
                    for _attr in list(_item.attrib.keys()):
                        if _attr not in ATTRIBUTES_GLOBAL:
                            del _item.attrib[_attr]

        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

        return chapter.content

Example #41

0

Show file

File: views.py Project: zeuser/Booktype

def export_book(input_file, filename):
    """Read a book in Booki.zip format and write it out as an EPUB file.

    Every chapter listed in the Booki.zip table of contents becomes an
    ``<name>.xhtml`` document in the EPUB. Links and interlinks are handled
    differently in the new format: a relative link that points at another
    chapter of the same book is rewritten to the chapter's ``.xhtml`` file
    (keeping its fragment), while any other scheme-less link has its ``<a>``
    tag dropped (the link text is kept). Links with an explicit URL scheme
    are left untouched.

    Chapters whose content cannot be parsed are skipped entirely: they are
    added neither to the book nor to its table of contents.

    :Args:
      - input_file: source passed straight to ``BookiZip``
      - filename: destination passed straight to ``epub.write_epub``
    """

    epub_book = ExportEpubBook()

    # Creating new EPUB file
    epub_book.add_prefix('bkterms', 'http://booktype.org/')

    # Read old Booki.zip format
    bookizip = BookiZip(input_file)

    _toc, _section, _section_name = [], [], None
    spine = ['nav']

    # Names of all the chapters/sections, with the leading 6 and trailing 5
    # characters of the stored file name cut off. Only used for membership
    # tests, hence a set.
    file_names = set(file_name[6:-5] for _, file_name, _ in bookizip.get_toc())

    for typ, file_name, title in bookizip.get_toc():
        # typ == 1 starts a new section: flush the chapters collected so far
        # and remember the new section title.
        if typ == 1:
            if _section_name is None and len(_section) > 0:
                _toc.append(_section)
            elif len(_section) > 0:
                _toc.append((epub.Section(_section_name), _section[:]))

            _section_name = title
            _section = []
            continue

        # Create new chapter with new filename
        c1 = epub.EpubHtml(title=title,
                           file_name='{}.xhtml'.format(file_name[6:-5]))
        cont = unicode(bookizip.read(file_name), 'utf-8')

        try:
            tree = parse_html_string(cont.encode('utf-8'))
        except Exception:
            # Just ignore everything if we can not parse the chapter
            continue

        # Register the chapter in the TOC only once we know it will really be
        # part of the book; otherwise the TOC would point at a missing file.
        _section.append(c1)

        # Change all the links in the document. The anchors are collected
        # up front because drop_tag() modifies the tree we are walking.
        for elem in list(tree.iter('a')):
            href = elem.get('href')
            if not href:
                continue

            urlp = urlparse.urlparse(href)
            if urlp.scheme != '':
                continue

            url_title = urlp.path

            if url_title and url_title in file_names:
                fixed_href = url_title + '.xhtml'
                if urlp.fragment:
                    fixed_href = "{}#{}".format(fixed_href, urlp.fragment)

                elem.set('href', fixed_href)
            else:
                # This removes every scheme-less link that does not point at
                # a known chapter. That should not really be happening.
                elem.drop_tag()

        # Serialise once, after all links have been fixed.
        c1.content = etree.tostring(tree,
                                    pretty_print=True,
                                    encoding='utf-8',
                                    xml_declaration=True)

        epub_book.add_item(c1)
        spine.append(c1)

    # Flush the last section
    if _section_name is None and len(_section) > 0:
        _toc.append(_section)
    elif len(_section) > 0:
        _toc.append((epub.Section(_section_name), _section[:]))

    # Add all of the attachments; unreadable ones are skipped
    for att_name in bookizip.get_attachments():
        try:
            blob = bookizip.read(att_name)
        except (IOError, OSError):
            continue
        else:
            itm = epub.EpubImage()
            itm.file_name = att_name
            itm.content = blob
            epub_book.add_item(itm)

    epub_book.set_title('Title', 'main')
    epub_book.set_language('en')
    epub_book.add_author('Author', role='aut', uid='author')

    epub_book.toc = _toc
    epub_book.spine = spine

    epub_book.add_item(epub.EpubNcx())
    epub_book.add_item(epub.EpubNav())

    opts = {'plugins': [TidyPlugin(), standard.SyntaxPlugin()]}
    epub.write_epub(filename, epub_book, opts)

Example #42

0

Show file

File: import_lyrics_wizard.py Project: maccesch/songscreen

    def _import_old_epub(self, lyrics_path):
        """Extract song lyrics from the old-format EPUB into JSON files.

        Reads the EPUB at ``self.epubs_page.old_epub`` (does nothing when it
        is not set). For every HTML document whose ``<title>`` looks like
        ``"<number> <title>"``, the children of ``div.pGroup`` are folded
        into a list of markers (``{'name': ..., 'text': ...}``):

        * a ``<p>`` starting with ``"<digit>. "`` opens a new verse marker,
        * any other ``<p>`` is appended as a new line to the current marker,
        * an element whose class contains ``chorus`` opens a marker named
          after its first child; its remaining children are the lines.

        The result is written to ``<lyrics_path>/<number>.json``. Documents
        that yield no markers are skipped.

        :param lyrics_path: directory the JSON files are written to
        """
        # NOTE: verse numbers are matched as a single digit only.
        new_verse_pattern = re.compile(r"^\s*(\d)\.\s+(.+)$")
        no_and_title_pattern = re.compile(r"^\s*(\d+)\s+(.+)$")

        if not self.epubs_page.old_epub:
            return

        book = epub.read_epub(self.epubs_page.old_epub)

        for item in book.items:
            if not isinstance(item, epub.EpubHtml):
                continue

            tree = parse_html_string(item.content).getroottree()

            titles = tree.xpath("//title/text()")
            if not titles:
                continue

            m = no_and_title_pattern.match(titles[0])
            if m is None:
                continue
            no, title = m.groups()

            markers = []
            marker = None

            for line_element in tree.xpath("//div[@class='pGroup']/*"):
                if line_element.tag == 'p':
                    # Flatten inline markup so that .text holds the full line
                    while len(line_element):
                        line_element[0].drop_tag()

                    # An empty <p> has text None
                    line_text = line_element.text or ''

                    m = new_verse_pattern.match(line_text)
                    if m is not None:
                        verse_no, line_text = m.groups()
                        if marker is not None:
                            markers.append(marker)

                        marker = {
                            'name': str(verse_no),
                            'text': line_text,
                        }
                    elif marker is not None:
                        marker['text'] += "\n{}".format(line_text)
                    # else: a line before the first verse/chorus has no
                    # marker to attach to and is ignored.

                elif "chorus" in line_element.attrib.get('class', ''):
                    children = list(line_element)
                    if not children:
                        # No heading element to name the marker after
                        continue

                    if marker is not None:
                        markers.append(marker)

                    heading = (children[0].text or '').strip()
                    marker = {
                        'name': heading.replace('(', '').replace(')', '').lower().capitalize(),
                        'text': "\n".join(child.text or '' for child in children[1:]),
                    }

            if marker is not None:
                markers.append(marker)

            # Same policy as the new-format importer: no markers, no file
            if markers:
                with open(os.path.join(lyrics_path, "{}.json".format(no)), "w") as f:
                    json.dump({
                        'title': title,
                        'markers': markers,
                    }, f, indent=2)

Example #43

0

Show file

File: import_lyrics_wizard.py Project: maccesch/songscreen

    def _import_new_epub(self, lyrics_path):
        """Extract song lyrics from the new-format EPUB into JSON files.

        Reads the EPUB at ``self.epubs_page.new_epub`` (does nothing when it
        is not set). A document is treated as a song when it has an
        ``h1/strong`` text (the song title) and its ``<title>`` starts with
        an integer (the song number); other documents are skipped.

        Each ``li`` below ``div.pGroup/ol`` starts a verse marker named after
        its 1-based position. Inside a verse:

        * a ``<p>`` whose class does not contain ``se`` adds a line to the
          current marker,
        * an element whose class contains ``chorus`` closes the current
          marker and opens one named after its first child, with the
          remaining children as lines,
        * any other element closes the current marker and opens an empty one
          named after the element's own text.

        Songs with at least one marker are written to
        ``<lyrics_path>/<song number>.json``.

        :param lyrics_path: directory the JSON files are written to
        """
        if not self.epubs_page.new_epub:
            return

        def _marker_name(text):
            # "(Chorus)" -> "Chorus"; None is treated as an empty name
            return (text or '').strip().replace('(', '').replace(')', '').lower().capitalize()

        book = epub.read_epub(self.epubs_page.new_epub)

        for item in book.items:
            if not isinstance(item, epub.EpubHtml):
                continue

            tree = parse_html_string(item.content).getroottree()

            title = tree.xpath("//h1/strong/text()")
            if not title:
                continue
            title = title[0]

            try:
                song_no = int(tree.xpath("//head/title/text()")[0].split(" ", 1)[0])
            except (IndexError, ValueError):
                # No <title>, or it does not start with a song number
                continue

            markers = []

            for verse_no, verse_element in enumerate(tree.xpath("//div[@class='pGroup']/ol/li"), 1):
                marker = {
                    'name': str(verse_no),
                    'text': '',
                }

                for line_element in list(verse_element):
                    css_class = line_element.attrib.get('class', '')

                    if line_element.tag == 'p' and 'se' not in css_class:
                        # Flatten inline markup so that .text holds the full line
                        while len(line_element):
                            line_element[0].drop_tag()

                        marker['text'] += "{}\n".format((line_element.text or '').strip())
                        continue

                    # Anything else starts a new marker: close the current
                    # one, dropping the trailing newline of its last line.
                    marker['text'] = marker['text'][:-1]
                    markers.append(marker)

                    if "chorus" in css_class and len(line_element):
                        children = list(line_element)
                        marker = {
                            'name': _marker_name(children[0].text),
                            'text': "".join("{}\n".format(child.text or '') for child in children[1:]),
                        }
                    else:
                        marker = {
                            'name': _marker_name(line_element.text),
                            'text': "",
                        }

                marker['text'] = marker['text'][:-1]
                markers.append(marker)

            if markers:
                with open(os.path.join(lyrics_path, "{}.json".format(song_no)), "w") as f:
                    json.dump({
                        'title': title,
                        'markers': markers,
                    }, f, indent=2)

Example #44

0

Show file

    def html_before_write(self, book, chapter):
        """Clean up a chapter's markup right before it is written.

        The chapter content is parsed, every tag listed in
        ``DEPRECATED_TAGS`` is stripped (its content is kept), and the
        attributes of each element in ``<head>`` and ``<body>`` are reduced
        to ``ATTRIBUTES_GLOBAL`` plus the per-tag lists below. Beyond that:

        * empty ``<title>`` and all ``<meta>`` elements are removed,
        * a ``<table summary="...">`` gets the summary as a ``<caption>``,
          and ``border="0"`` is replaced by an empty ``border``,
        * ``<dl>`` attributes are left untouched,
        * ``<svg>`` adds the ``svg`` property to the chapter and loses its
          lower-cased ``viewbox`` / ``preserveaspectratio`` attributes,
        * the first remote ``<img>`` adds the ``remote-resources`` property
          to the chapter and registers an image item for it on ``book``.

        :Args:
          - book: book the chapter belongs to
          - chapter: chapter item; its ``content`` is replaced in place

        :Returns:
          The new serialised content, or ``None`` (content untouched) when
          the chapter could not be parsed.
        """
        from lxml import etree

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Best effort: content we can not parse is written as it is
            return

        root = tree.getroottree()

        # delete deprecated tags
        # i should really have a list of allowed tags
        for tag in DEPRECATED_TAGS:
            etree.strip_tags(root, tag)

        # Attributes allowed on top of ATTRIBUTES_GLOBAL, per <head> child
        head_attributes = {
            'base': ['href', 'target'],
            'link': ['href', 'crossorigin', 'rel', 'media', 'hreflang',
                     'type', 'sizes'],
            'script': ['src', 'type', 'charset', 'async', 'defer',
                       'crossorigin'],
            'source': ['src', 'type', 'media'],
            'style': ['media', 'type', 'scoped'],
        }

        # Attributes allowed on top of ATTRIBUTES_GLOBAL, per <body> element.
        # Tags missing here (and not special-cased below) keep only the
        # global attributes.
        body_attributes = {
            'a': ['href', 'target', 'download', 'rel', 'hreflang', 'type'],
            'area': ['alt', 'coords', 'shape', 'href', 'target', 'download',
                     'rel', 'hreflang', 'type'],
            'audio': ['src', 'crossorigin', 'preload', 'autoplay',
                      'mediagroup', 'loop', 'muted', 'controls'],
            'blockquote': ['cite'],
            'button': ['autofocus', 'disabled', 'form', 'formaction',
                       'formenctype', 'formmethod', 'formnovalidate',
                       'formtarget', 'name', 'type', 'value', 'menu'],
            'canvas': ['width', 'height'],
            'col': ['span'],
            'colgroup': ['span'],
            'del': ['cite', 'datetime'],
            'details': ['open'],
            'embed': ['src', 'type', 'width', 'height'],
            'fieldset': ['disabled', 'form', 'name'],
            'form': ['accept-charset', 'action', 'autocomplete', 'enctype',
                     'method', 'name', 'novalidate', 'target'],
            'iframe': ['src', 'srcdoc', 'name', 'sandbox', 'seamless',
                       'allowfullscreen', 'width', 'height'],
            'img': ['alt', 'src', 'crossorigin', 'usemap', 'ismap', 'width',
                    'height'],
            'input': ['accept', 'alt', 'autocomplete', 'autofocus',
                      'checked', 'dirname', 'disabled', 'form', 'formaction',
                      'formenctype', 'formmethod', 'formnovalidate',
                      'formtarget', 'height', 'inputmode', 'list', 'max',
                      'maxlength', 'min', 'multiple', 'name', 'pattern',
                      'placeholder', 'readonly', 'required', 'size', 'src',
                      'step', 'type', 'value', 'width'],
            'ins': ['cite', 'datetime'],
            'keygen': ['autofocus', 'challenge', 'disabled', 'form',
                       'keytype', 'name'],
            'label': ['form', 'for'],
            'map': ['name'],
            'menu': ['type', 'label'],
            'object': ['data', 'type', 'typemustmatch', 'name', 'usemap',
                       'form', 'width', 'height'],
            'ol': ['reversed', 'start', 'type'],
            'optgroup': ['disabled', 'label'],
            'option': ['disabled', 'label', 'selected', 'value'],
            'output': ['for', 'form', 'name'],
            'param': ['name', 'value'],
            'progress': ['value', 'max'],
            'q': ['cite'],
            'select': ['autofocus', 'disabled', 'form', 'multiple', 'name',
                       'required', 'size'],
            'table': ['border', 'sortable'],
            'td': ['colspan', 'rowspan', 'headers'],
            'textarea': ['autocomplete', 'autofocus', 'cols', 'dirname',
                         'disabled', 'form', 'inputmode', 'maxlength', 'name',
                         'placeholder', 'readonly', 'required', 'rows',
                         'wrap'],
            'th': ['colspan', 'rowspan', 'headers', 'scope', 'abbr',
                   'sorted'],
            'time': ['datetime'],
            'track': ['kind', 'src', 'srclang', 'label', 'default'],
            'video': ['src', 'crossorigin', 'poster', 'preload', 'autoplay',
                      'mediagroup', 'loop', 'muted', 'controls', 'width',
                      'height'],
        }

        head = tree.find('head')

        if head is not None and len(head) != 0:
            # Iterate over a copy: children are removed while we go
            for _item in list(head):
                if _item.tag == 'title':
                    # an empty <title/> has text None, not ''
                    if not _item.text:
                        head.remove(_item)
                elif _item.tag == 'meta':
                    # just remove for now, but really should not be like this
                    head.remove(_item)
                else:
                    leave_only(
                        _item,
                        ATTRIBUTES_GLOBAL + head_attributes.get(_item.tag, []))

        body = tree.find('body')

        if body is not None and len(body) != 0:
            for _item in body.iter():
                # it is not
                # <a class="indexterm" href="ch05.html#ix_epub:trigger_element">
                _tag = _item.tag

                if _tag == 'img':
                    _src = _item.get('src', '').lower()
                    if _src.startswith('http://') or _src.startswith(
                            'https://'):
                        if 'remote-resources' not in chapter.properties:
                            chapter.properties.append('remote-resources')
                            # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
                            # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
                            from ebooklib import epub
                            _img = epub.EpubImage(file_name=_item.get('src'))
                            book.add_item(_img)
                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attributes[_tag])
                elif _tag == 'table':
                    if _item.get('border', None):
                        if _item.get('border') == '0':
                            _item.set('border', '')

                    if _item.get('summary', None):
                        # add it as caption
                        _caption = etree.Element('caption', {})
                        _caption.text = _item.get('summary')
                        _item.insert(0, _caption)

                        del _item.attrib['summary']

                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attributes[_tag])
                elif _tag == 'dl':
                    # Nothing is changed on <dl> for now.
                    # http://html5doctor.com/the-dl-element/
                    # should be like this really
                    # some of the elements can be missing
                    # dl
                    #   dt
                    #   dd
                    #   dt
                    #   dd
                    pass
                elif _tag == 'svg':
                    # We need to add property "svg" in case we have embeded svg file
                    if 'svg' not in chapter.properties:
                        chapter.properties.append('svg')

                    if _item.get('viewbox', None):
                        del _item.attrib['viewbox']

                    if _item.get('preserveaspectratio', None):
                        del _item.attrib['preserveaspectratio']
                elif _tag in body_attributes:
                    leave_only(_item, ATTRIBUTES_GLOBAL + body_attributes[_tag])
                else:
                    # Snapshot the keys: attributes are deleted while we go
                    for _attr in list(six.iterkeys(_item.attrib)):
                        if _attr not in ATTRIBUTES_GLOBAL:
                            del _item.attrib[_attr]

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8',
                                         xml_declaration=True)

        return chapter.content

Example #45

0

Show file

File: misc.py Project: kronoscode/Booktype

def import_book_from_file(epub_file, user, **kwargs):
    """Create a new book from the content of an EPUB file.

    The EPUB is read with the ``TidyPlugin`` and ``ImportPlugin`` plugins.
    Its images become attachments, every document for which
    ``is_chapter()`` is true becomes a chapter, links between imported
    chapters are rewritten to the new chapter URLs, and the table of
    contents is rebuilt from the EPUB TOC.

    :Args:
      - epub_file: EPUB source passed straight to ``epub.read_epub``
      - user: user passed to ``create_book``
      - book_title (keyword): title of the new book; defaults to the first
        DC title of the EPUB
      - book_url (keyword): passed to ``create_book``; defaults to None

    :Returns:
      The book returned by ``create_book``.
    """
    import uuid

    from django.utils.timezone import utc
    from lxml import etree
    from ebooklib.utils import parse_html_string
    from .book import create_book

    opts = {'plugins': [TidyPlugin(), ImportPlugin()]}
    epub_book = epub.read_epub(epub_file, opts)

    # file name -> title as given in the EPUB TOC
    chapters = {}
    # flat list of (type, name, unique id, parent id); type 1 is a section,
    # type 0 a chapter
    toc = []

    def _parse_toc(elements, parent=None):
        for _elem in elements:
            # used later to get parent of an elem
            unique_id = uuid.uuid4().hex

            if isinstance(_elem, tuple):
                toc.append((1, _elem[0].title, unique_id, parent))
                _parse_toc(_elem[1], unique_id)
            elif isinstance(_elem, epub.Section):
                pass
            elif isinstance(_elem, epub.Link):
                _u = urlparse.urlparse(_elem.href)
                _name = urllib.unquote(os.path.basename(_u.path))
                if not _name:
                    _name = _elem.title

                if _name not in chapters:
                    chapters[_name] = _elem.title
                    toc.append((0, _name, unique_id, parent))

    _parse_toc(epub_book.toc)

    epub_book_name = epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
    title = kwargs.get('book_title', epub_book_name)
    book_url = kwargs.get('book_url', None)

    # must check if title already exists
    book = create_book(user, title, book_url=book_url)
    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    for attach in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        att = models.Attachment(
            book=book,
            version=book.version,
            status=stat
        )

        s = attach.get_content()
        f = StringIO.StringIO(s)
        try:
            f2 = File(f)
            f2.size = len(s)
            att.attachment.save(attach.file_name, f2, save=False)
            att.save()
        finally:
            # release the buffer even when saving fails
            f.close()

    # unquoted base file name -> saved chapter
    _imported = {}
    # TODO: ask about importing empty sections

    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not chap.is_chapter():
            continue

        # check if this chapter name already exists
        file_key = urllib.unquote(os.path.basename(chap.file_name))
        content = chap.get_body_content()

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if file_key in chapters:
            name = chapters[file_key]
        else:
            # fall back to the file name without extension and dots
            name = _convert_file_name(file_key)
            if name.rfind('.') != -1:
                name = name[:name.rfind('.')]
            name = name.replace('.', '')

        chapter = models.Chapter(
            book=book,
            version=book.version,
            url_title=booktype_slugify(unicode(name)),
            title=name,
            status=stat,
            content=content,
            created=now,
            modified=now
        )
        chapter.save()
        _imported[file_key] = chapter

    # fix links
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if not chap.is_chapter():
            continue

        try:
            tree = parse_html_string(chap.get_content())
        except Exception:
            # Nothing to fix in a chapter we can not parse. Without this
            # `tree` would be undefined or left over from the last chapter.
            continue

        body = tree.find('body')
        if body is None or len(body) == 0:
            continue

        to_save = False

        for _item in body.iter('a'):
            _href = _item.get('href')
            if not _href:
                continue

            _u = urlparse.urlparse(_href)
            pth = urllib.unquote(os.path.basename(_u.path))

            if pth in _imported:
                _name = _imported[pth].url_title

                _u2 = urlparse.urljoin(_href, '../' + _name + '/')
                _item.set('href', _u2)
                to_save = True

        if to_save:
            chap.content = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

            saved_chapter = _imported[urllib.unquote(os.path.basename(chap.file_name))]
            saved_chapter.content = chap.content
            saved_chapter.save()

    # Weights are handed out in descending order, so that the first TOC
    # entry gets the highest weight.
    n = len(toc) + 1
    # unique id -> saved toc item, used to resolve parents
    parents = {}

    for _elem in toc:
        if _elem[0] == 1:  # section
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=_elem[1],
                chapter=None,
                weight=n,
                typeof=2
            )
        else:
            if _elem[1] not in _imported:
                continue

            chap = _imported[_elem[1]]
            toc_item = models.BookToc(
                book=book,
                version=book.version,
                name=chap.title,
                chapter=chap,
                weight=n,
                typeof=1
            )

        # check if elem has parent
        if _elem[3]:
            toc_item.parent = parents.get(_elem[3], None)
        toc_item.save()

        # decrease weight
        n -= 1

        # save temporarily the toc_item in parent
        parents[_elem[2]] = toc_item

    return book

Example #46

0

Show file

def import_book_from_file(epub_file, user, **kwargs):
    """
    Import an EPUB file as a new book.

    Reads the EPUB, creates the book, stores every image as an attachment,
    creates one chapter per EPUB document, rewrites links between imported
    chapters and finally rebuilds the table of contents.

    :Args:
      - epub_file: EPUB source handed to ``epub.read_epub``.
      - user: User passed to ``create_book`` for the new book.
      - book_title (keyword, optional): Title of the new book. The DC title
        from the EPUB metadata is only looked up when this is not supplied.
      - book_url (keyword, optional): Passed to ``create_book`` as
        ``book_url``; defaults to None.

    :Returns:
      The newly created book.
    """
    import uuid

    from django.utils.timezone import utc
    from lxml import etree
    from ebooklib.utils import parse_html_string
    from .book import create_book

    opts = {'plugins': [TidyPlugin(), ImportPlugin()]}
    epub_book = epub.read_epub(epub_file, opts)

    # chapter key -> title taken from the EPUB table of contents
    chapters = {}
    # flat list of (is_section, name, unique_id, parent_id) tuples
    toc = []

    def _chapter_key(path):
        # Chapters are identified everywhere by their unquoted base file name
        return urllib.unquote(os.path.basename(path))

    def _parse_toc(elements, parent=None):
        for _elem in elements:
            # used later to get parent of an elem
            unique_id = uuid.uuid4().hex

            if isinstance(_elem, tuple):
                # (section, children) pair: record the section, then recurse
                toc.append((1, _elem[0].title, unique_id, parent))
                _parse_toc(_elem[1], unique_id)
            elif isinstance(_elem, epub.Section):
                pass
            elif isinstance(_elem, epub.Link):
                _name = _chapter_key(urlparse.urlparse(_elem.href).path)
                if not _name:
                    _name = _elem.title

                # only the first TOC entry pointing at a file is kept
                if _name not in chapters:
                    chapters[_name] = _elem.title
                    toc.append((0, _name, unique_id, parent))

    _parse_toc(epub_book.toc)

    # Only touch the EPUB metadata when the caller did not supply a title,
    # so a missing DC title does not break an import with an explicit one.
    if 'book_title' in kwargs:
        title = kwargs['book_title']
    else:
        title = epub_book.metadata[epub.NAMESPACES['DC']]['title'][0][0]
    book_url = kwargs.get('book_url', None)

    # must check if title already exists
    book = create_book(user, title, book_url=book_url)
    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    stat = models.BookStatus.objects.filter(book=book, name="new")[0]

    for attach in epub_book.get_items_of_type(ebooklib.ITEM_IMAGE):
        att = models.Attachment(book=book, version=book.version, status=stat)

        data = attach.get_content()
        stream = StringIO.StringIO(data)
        try:
            upload = File(stream)
            upload.size = len(data)
            att.attachment.save(attach.file_name, upload, save=False)
            att.save()
        finally:
            # release the buffer even if saving the attachment fails
            stream.close()

    # chapter key -> saved models.Chapter
    _imported = {}
    # TODO: ask about importing empty sections

    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Nav and Cover are not imported
        if not chap.is_chapter():
            continue

        key = _chapter_key(chap.file_name)
        content = chap.get_body_content()

        # maybe this part has to go to the plugin
        # but you can not get title from <title>
        if key in chapters:
            name = chapters[key]
        else:
            # fall back to the file name without extension and without dots
            name = _convert_file_name(key)
            if name.rfind('.') != -1:
                name = name[:name.rfind('.')]
            name = name.replace('.', '')

        chapter = models.Chapter(book=book,
                                 version=book.version,
                                 url_title=booktype_slugify(unicode(name)),
                                 title=name,
                                 status=stat,
                                 content=content,
                                 created=now,
                                 modified=now)
        chapter.save()
        _imported[key] = chapter

    # fix links
    for chap in epub_book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if not chap.is_chapter():
            continue

        content = chap.get_content()
        try:
            tree = parse_html_string(content)
        except Exception:
            # Unparsable chapter: there is nothing to rewrite. Skipping here
            # also prevents reusing the tree of the previous chapter.
            continue

        body = tree.find('body')
        if body is None:
            continue

        to_save = False

        for _item in body.iter('a'):
            _href = _item.get('href')
            if not _href:
                continue

            pth = _chapter_key(urlparse.urlparse(_href).path)
            if pth in _imported:
                # point the link at the imported chapter's url_title
                _name = _imported[pth].url_title
                _item.set('href', urlparse.urljoin(_href, '../' + _name + '/'))
                to_save = True

        if to_save:
            chap.content = etree.tostring(tree,
                                          pretty_print=True,
                                          encoding='utf-8',
                                          xml_declaration=True)
            imported_chapter = _imported[_chapter_key(chap.file_name)]
            imported_chapter.content = chap.content
            imported_chapter.save()

    # weights count down so earlier TOC entries get the higher weight
    n = len(toc) + 1
    # unique_id -> saved BookToc, used to resolve parents
    parents = {}

    for _elem in toc:
        if _elem[0] == 1:  # section
            toc_item = models.BookToc(book=book,
                                      version=book.version,
                                      name=_elem[1],
                                      chapter=None,
                                      weight=n,
                                      typeof=2)
        else:
            # TOC entry for a file that was not imported as a chapter
            if _elem[1] not in _imported:
                continue

            chap = _imported[_elem[1]]
            toc_item = models.BookToc(book=book,
                                      version=book.version,
                                      name=chap.title,
                                      chapter=chap,
                                      weight=n,
                                      typeof=1)

        # check if elem has parent
        if _elem[3]:
            toc_item.parent = parents.get(_elem[3], None)
        toc_item.save()

        # decrease weight
        n -= 1

        # save temporarily the toc_item in parent
        parents[_elem[2]] = toc_item

    return book

Example #47

0

Show file

File: epub.py Project: eos87/ebooklib

    def get_content(self, default=None):
        """
        Returns content for this document as HTML string. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).

        The document is built from the book template: a new head (title and
        links) and a new body (children of the parsed content's body) are
        appended to the template root.

        :Args:
          - default: Default value for the content if it is not defined.
            Not used by this implementation.

        :Returns:
          Returns content of this document, or an empty string when the
          stored content can not be parsed as HTML.
        """

        tree = parse_string(self.book.get_template(self._template_name))
        tree_root = tree.getroot()

        # document language falls back to the language of the book
        lang = self.lang or self.book.language
        tree_root.set('lang', lang)
        tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = lang

        # add to the head also
        #  <meta charset="utf-8" />

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            # unparsable content yields an empty document string
            return ''

        # create and populate head
        # NOTE: children of the parsed document's own <head> are not copied.

        _head = etree.SubElement(tree_root, 'head')

        if self.title != '':
            _title = etree.SubElement(_head, 'title')
            _title.text = self.title

        for lnk in self.links:
            if lnk.get("type") == "text/javascript":
                _lnk = etree.SubElement(_head, 'script', lnk)
                # force <script></script>
                _lnk.text = ''
            else:
                _lnk = etree.SubElement(_head, 'link', lnk)

        # create and populate body

        _body = etree.SubElement(tree_root, 'body')
        if self.direction:
            _body.set('dir', self.direction)

        body = html_tree.find('body')
        if body is not None:
            # Appending moves each child out of the parsed tree, so iterate
            # over a snapshot of the children instead of the live element.
            for child in list(body):
                _body.append(child)

        return etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

Example #48

0

Show file

 def init_links_of_html(self, html_item):
     """Add an EpubItem to html_item for every <link> element in its content.

     html_item.content is parsed as HTML and each element whose local name
     is "link" is turned into an item built from its href attribute.
     """
     html_tree = parse_html_string(html_item.content)
     # local-name() matches <link> elements regardless of their namespace
     for link in html_tree.getroottree().xpath('//*[local-name()="link"]'):
         # NOTE(review): every link is registered as text/css without checking
         # rel/type, and href is passed through as None when the attribute is
         # missing - confirm this is intended.
         item=epub.EpubItem(file_name= link.get('href'), media_type='text/css')
         html_item.add_item(item)