Ejemplo n.º 1
0
    def html_after_read(self, book, chapter):
        """
        Run the tidy cleanup over a chapter's HTML once it has been read.

        The chapter's ``content`` attribute is overwritten with the second
        item of the pair returned by ``tidy_cleanup``.

        Args:
          - book: not used by this hook
          - chapter: object exposing ``content`` and ``get_content()``

        Returns:
          The updated ``chapter.content``, or ``None`` when the chapter has
          no content.
        """
        if chapter.content:
            # tidy_cleanup yields a pair; only its second item is kept
            _ignored, cleaned = tidy_cleanup(chapter.get_content(), **self.options)
            chapter.content = cleaned
            return chapter.content

        return None
Ejemplo n.º 2
0
    def html_after_read(self, book, chapter):
        """
        Tidy a freshly read chapter and store the result back on it.

        Args:
          - book: not used by this hook
          - chapter: object exposing ``content`` and ``get_content()``

        Returns:
          ``None`` for a chapter without content; otherwise the chapter's
          ``content`` after it was replaced with the tidied markup.
        """
        # nothing to clean for an empty chapter
        if not chapter.content:
            return None

        raw_html = chapter.get_content()
        # only the second item of the returned pair is of interest here
        _unused, tidied = tidy_cleanup(raw_html, **self.options)
        chapter.content = tidied

        return chapter.content
Ejemplo n.º 3
0
    def _fix_merged_content(self, content):
        """
        Repair the structure of a chapter after its content was concatenated.

        Every endnote (``li`` whose id starts with ``endnote-``) is moved into a
        single new ``ol.endnotes`` block appended at the end of the body, and the
        matching ``sup[data-id]`` references are renumbered sequentially.
        Endnotes without a matching reference are dropped together with the old
        endnotes blocks, and a warning is sent to ``self.notifier``.

        Args:
          - content (`str`) chapter's html string content

        Returns:
          The tidied content unchanged when it has no endnotes block; otherwise
          the serialized ``<body>`` element (UTF-8 encoded, no XML declaration)
          holding the merged endnotes.
        """

        # there might be some cases where the html content of the chapter is not fully
        # clean because of the old importer class. Let's apply some cleanup just in case
        content = tidy_cleanup(content, **TidyPlugin.OPTIONS)[1]

        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(content, parser=utf8_parser)

        # first we check if there are endnotes; without them nothing needs fixing
        endnotes = tree.xpath('.//ol[@class="endnotes"]')
        if not endnotes:
            return content

        # let's create a new final block of endnotes to merge them all.
        # It is created after `endnotes` was collected, so it is not part of the
        # old blocks removed further down.
        endnotes_block = etree.SubElement(tree.find('body'), 'ol',
                                          {'class': 'endnotes'})
        idx_endnote = 1

        for note in tree.xpath('.//li[starts-with(@id, "endnote-")]'):
            key = note.get('id', '').replace('endnote-', '')

            # let's find the reference, otherwise we delete the endnote.
            # The key is bound as an XPath variable instead of being formatted
            # into the expression, so ids containing quotes cannot break it.
            references = tree.xpath('//sup[@data-id=$key]', key=key)
            if not references:
                self.notifier.warning(
                    _("Reference not found for endnote: {}").format(note.text))
                # the note stays in its old block, which is removed below
                continue

            # make sure reference number is correct (only the first match is updated)
            references[0].text = '{}'.format(idx_endnote)
            idx_endnote += 1

            # appending moves the note out of its old block into the merged one
            endnotes_block.append(note)

        # remove old endnotes blocks from content
        for oldblock in endnotes:
            oldblock.drop_tree()

        # let's use only body and its content
        return etree.tostring(tree.find('body'),
                              encoding='utf-8',
                              xml_declaration=False)