def _make_coverpage_link(self):
        """ Insert a <link rel="coverpage"> into the html head.

        First we determine the coverpage url.  In HTML we find the
        coverpage by applying these rules:

          1. the image specified in <link rel='coverpage'>,
          2. the image with an id of 'coverpage',
          3. the image with an url containing 'cover', or
          4. the image with an url containing 'title'.

        If a rule matches, we take the first image in document
        order; otherwise we proceed with the next rule.
        """

        coverpages = xpath(self.xhtml, "//xhtml:link[@rel='coverpage']")
        for coverpage in coverpages:
            url = coverpage.get('href')
            debug("Found link to coverpage %s." % url)
            return  # already provided by user

        # look for a suitable candidate
        coverpages = xpath(self.xhtml, "//xhtml:img[@id='coverpage']")
        if not coverpages:
            coverpages = xpath(self.xhtml, "//xhtml:img[contains(@src, 'cover')]")
        if not coverpages:
            coverpages = xpath(self.xhtml, "//xhtml:img[contains(@src, 'title')]")

        for coverpage in coverpages:
            for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
                url = coverpage.get('src')
                head.append(parsers.em.link(rel='coverpage', href=url))
                debug("Inserted link to coverpage %s." % url)
            return  # take only the first candidate in document order
Example #2
    def strip_links(xhtml, manifest):
        """ Strip all links to urls not in manifest.

        This includes <a href>, <link href> and <img src>
        Assume links and urls are already made absolute.

        """

        for link in xpath(xhtml, '//xhtml:a[@href]'):
            href = urllib.parse.urldefrag(link.get('href'))[0]
            if href not in manifest:
                debug("strip_links: Deleting <a> to %s not in manifest." %
                      href)
                del link.attrib['href']

        for link in xpath(xhtml, '//xhtml:link[@href]'):
            href = link.get('href')
            if href not in manifest:
                debug("strip_links: Deleting <link> to %s not in manifest." %
                      href)
                link.drop_tree()

        for image in xpath(xhtml, '//xhtml:img[@src]'):
            src = image.get('src')
            if src not in manifest:
                debug(
                    "strip_links: Deleting <img> with src %s not in manifest."
                    % src)
                image.tail = image.get('alt', '') + (image.tail or '')
                image.drop_tree()
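Note that `urldefrag` keeps only the document part of the url, so an <a> pointing at an anchor inside a manifested file is preserved:

from urllib.parse import urldefrag

href = urldefrag('http://example.org/ch1.html#sec2')[0]
print(href)  # http://example.org/ch1.html -- this, not the full url, is looked up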
Example #3
    def reflow_pre(xhtml):
        """ make <pre> reflowable.

        This helps a lot with readers like Sony's that cannot
        scroll horizontally.

        """
        def nbsp(matchobj):
            # Replace a run of N spaces with (N-1) no-break spaces,
            # keeping one normal space as a line-break opportunity.
            return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

        for pre in xpath(xhtml, "//xhtml:pre"):
            # white-space: pre-wrap would do fine
            # but it is not supported by OEB
            try:
                pre.tag = NS.xhtml.div
                writers.HTMLishWriter.add_class(pre, 'pgmonospaced')
                m = parsers.RE_GUTENBERG.search(pre.text or '')
                if m:
                    writers.HTMLishWriter.add_class(pre, 'pgheader')

                tail = pre.tail
                s = etree.tostring(pre,
                                   encoding=six.text_type,
                                   with_tail=False)
                s = s.replace('>\n', '>')  # eliminate that empty first line
                s = s.replace('\n', '<br/>')
                s = re.sub('  +', nbsp, s)
                div = etree.fromstring(s)
                div.tail = tail

                pre.getparent().replace(pre, div)

            except etree.XMLSyntaxError as what:
                exception("%s\n%s" % (s, what))
                raise
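A minimal sketch of the string rewriting above, outside the lxml pipeline (the real input is a serialized <pre> element):

import re

def nbsp(matchobj):
    # same helper as above: N spaces -> (N-1) no-break spaces + 1 space
    return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

s = '<pre>\ndef f():\n    return 1\n</pre>'
s = s.replace('>\n', '>')     # eliminate the empty first line
s = s.replace('\n', '<br/>')  # hard line breaks survive reflowing
s = re.sub('  +', nbsp, s)    # indentation runs survive reflowing
print(s)  # <pre>def f():<br/>\xa0\xa0\xa0 return 1<br/></pre>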
Example #4
    def strip_rst_dropcaps(xhtml):
        """ Replace <img class='dropcap'> with <span class='dropcap'>.

        """

        for e in xpath(xhtml, "//xhtml:img[@class = 'dropcap']"):
            e.tag = NS.xhtml.span
            e.text = e.get('alt', '')
Example #5
    def strip_ins(xhtml):
        """
        Strip all <ins> tags.

        There's a bug in the epub validator that trips on class and
        title attributes in <ins> elements.

        """
        for ins in xpath(xhtml, '//xhtml:ins'):
            ins.drop_tag()
Example #6
    def strip_noepub(xhtml):
        """ Strip all <* class='x-ebookmaker-drop'> tags.

        This gives you a way to tailor your html towards epub.

        """

        for e in xpath(xhtml,
                       "//xhtml:*[contains (@class, 'x-ebookmaker-drop')]"):
            e.drop_tree()
Example #7
    def _make_coverpage_link(self, coverpage_url=None):
        """ Insert a <link rel="coverpage"> in the html head
        using the image specified by the --cover command-line option
        """

        if coverpage_url:
            for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
                head.append(
                    parsers.em.link(rel='coverpage', href=coverpage_url))
                debug("Inserted link to coverpage %s." % coverpage_url)
            return
Example #8
    def remove_coverpage(self, xhtml, url):
        """ Remove coverpage from flow.

        EPUB readers display the coverpage from the manifest; if we
        don't remove it from the flow, it will be displayed twice.

        """
        for img in xpath(xhtml, '//xhtml:img[@src = $url]', url=url):
            debug("remove_coverpage: dropping <img> %s from flow" % url)
            img.drop_tree()
            return  # only the first one though
Example #9
    def add_dublincore(self, job, tree):
        """ Add dublin core metadata to <head>. """
        source = gg.archive2files(options.ebook, job.url)

        if hasattr(options.config, 'FILESDIR'):
            job.dc.source = source.replace(options.config.FILESDIR,
                                           options.config.PGURL)

        for head in xpath(tree, '//xhtml:head'):
            for e in job.dc.to_html():
                e.tail = '\n'
                head.append(e)
Example #10
    def fix_style_elements(xhtml):
        """ Fix CSS style elements.  Make sure they are utf-8. """

        # debug ("enter fix_style_elements")

        for style in xpath(xhtml, "//xhtml:style"):
            p = parsers.CSSParser.Parser()
            p.parse_string(style.text)
            try:
                # pylint: disable=E1103
                style.text = p.sheet.cssText.decode('utf-8')
            except (ValueError, UnicodeError):
                debug("CSS:\n%s" % p.sheet.cssText)
                raise
Example #11
    def manifest_item(self, url, mediatype, id_=None):
        """ Add item to manifest. """

        if id_ is None or xpath(self.manifest, "//*[@id = '%s']" % id_):
            self.item_id += 1
            id_ = 'item%d' % self.item_id

        self.manifest.append(
            self.opf.item(**{
                'href': url,
                'id': id_,
                'media-type': mediatype
            }))

        return id_
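Hypothetical usage (names assumed): a caller-supplied id is kept unless it is missing or already taken, in which case a generated 'item%d' id is returned instead:

# id1 = self.manifest_item('ch1.xhtml', mt.xhtml, id_='chap1')   # -> 'chap1'
# id2 = self.manifest_item('ch2.xhtml', mt.xhtml, id_='chap1')   # -> e.g. 'item1'
# id3 = self.manifest_item('cover.jpg', mt.jpeg)                 # -> e.g. 'item2'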
Example #12
    def iterlinks(self):
        """ Return all links in document. """

        # To keep an image even in a non-image build, specify
        # class="x-ebookmaker-important"

        keeps = xpath(self.xhtml,
                      "//img[contains (@class, 'x-ebookmaker-important')]")
        for keep in keeps:
            keep.set('rel', 'important')

        # iterate links

        for (elem, dummy_attribute, url, dummy_pos) in self.xhtml.iterlinks():
            yield url, elem
Example #13
    def strip_links(xhtml, manifest):
        """
        Strip all links to images.

        This does not strip inline images, only standalone images that
        are targets of links. EPUB does not allow that.

        """
        for link in xpath(xhtml, '//xhtml:a[@href]'):
            href = urllib.parse.urldefrag(link.get('href'))[0]
            if manifest[href] not in OPS_CONTENT_DOCUMENTS:
                debug(
                    "strip_links: Deleting <a> to non-ops-document-type: %s" %
                    href)
                del link.attrib['href']
                continue
Example #14
    def add_coverpage(self, ocf, url):
        """ Add a coverpage for ADE and Kindle.

        The recommended cover size is 600x800 pixels (500 pixels on
        the smaller side is an absolute minimum). The cover page
        should be a color picture in JPEG format.

        """

        id_ = None

        # look for a manifest item with the right url
        for item in xpath(
                self.manifest,
                # cannot xpath for default namespace
                "//*[local-name () = 'item' and (starts-with (@media-type, 'image/jpeg') or starts-with(@media-type, 'image/png')) and @href = $url]",
                url=url):

            id_ = item.get('id')
            break

        # else use default cover page image
        if id_ is None:
            ext = url.split('.')[-1]
            try:
                mediatype = getattr(mt, ext)
            except AttributeError:
                mediatype = mt.jpeg
            try:
                with open(url, 'rb') as f:  # binary: ocf.add_bytes wants bytes
                    ocf.add_bytes(Writer.url2filename(url), f.read(),
                                  mediatype)
            except IOError:
                url = 'cover.jpg'
                ocf.add_bytes(url, resource_string('ebookmaker.writers', url),
                              mediatype)
            id_ = self.manifest_item(url, mediatype)

        debug("Adding coverpage id: %s url: %s" % (id_, url))

        # register mobipocket style
        self.meta_item('cover', id_)

        # register ADE style
        href = ocf.add_image_wrapper(Writer.url2filename(url), 'Cover')
        self.spine_item(href, mt.xhtml, 'coverpage-wrapper', True, True)
        self.guide_item(href, 'cover', 'Cover')
Example #15
    def insert_root_div(xhtml):
        """ Insert a div immediately below body and move body contents
        into it.

        Rationale: We routinely turn page numbers into <a> elements.
        <a> elements are illegal as children of body, but are legal as
        children of <div>. See: `strip_page_numbers ()`

        """
        em = ElementMaker(namespace=str(NS.xhtml), nsmap={None: str(NS.xhtml)})

        for body in xpath(xhtml, "//xhtml:body"):
            div = em.div()
            div.set('id', 'pgepub-root-div')
            for child in list(body):  # copy: we are moving the children
                div.append(child)
            body.append(div)
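The effect, sketched on a trivial body (whitespace added for readability):

# before:  <body><p>text</p><a id="pg1"/></body>
# after:   <body><div id="pgepub-root-div"><p>text</p><a id="pg1"/></div></body>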
Example #16
    def fix_html_image_dimensions(xhtml):
        """

        Remove all width and height that is not specified in '%'.

        """

        for img in xpath(xhtml, '//xhtml:img'):
            a = img.attrib

            if '%' in a.get('width', '%') and '%' in a.get('height', '%'):
                continue

            if 'width' in a:
                del a['width']
            if 'height' in a:
                del a['height']
Example #17
    def get_classes_that_float(xhtml):
        """ Get a list of all classes that use float or position. """

        classes = set()
        regex = re.compile(r"\.(\w+)", re.ASCII)

        for style in xpath(xhtml, "//xhtml:style"):
            p = parsers.CSSParser.Parser()
            p.parse_string(style.text)

            for rule in p.sheet:
                if rule.type == rule.STYLE_RULE:
                    for prop in rule.style:
                        if prop.name in ('float', 'position'):
                            classes.update(
                                regex.findall(rule.selectorList.selectorText))
                            break

        return classes
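A rough standalone approximation of the harvesting logic, using a naive rule split instead of the ebookmaker-internal `parsers.CSSParser` wrapper:

import re

CSS = ".pagenum { position: absolute } .sidenote { float: right } .plain { color: black }"

def classes_that_float(css):
    classes = set()
    class_re = re.compile(r"\.(\w+)", re.ASCII)
    for rule in css.split('}'):  # naive split; the real code walks parsed rules
        if 'float' in rule or 'position' in rule:
            classes.update(class_re.findall(rule))
    return classes

print(classes_that_float(CSS))  # {'pagenum', 'sidenote'} (set order varies)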
Example #18
    def parse(self):
        """ Parse the plain text.

        Try to find semantic units in the character soup. """

        debug("GutenbergTextParser.parse () ...")

        if self.xhtml is not None:
            return

        text = self.unicode_content()
        text = parsers.RE_RESTRICTED.sub('', text)
        text = gg.xmlspecialchars(text)

        lines = [line.rstrip() for line in text.splitlines()]
        lines.append("")
        del text

        blanks = 0
        par = Par()

        for line in lines:
            if len(line) == 0:
                blanks += 1
            else:
                if blanks and par.lines:  # don't append empty pars
                    par.after = blanks
                    self.pars.append(par)
                    if self.body == 1:
                        self.max_blanks = max(blanks, self.max_blanks)
                    par = Par()
                    par.before = blanks
                    blanks = 0

                par.lines.append(line)

        par.after = blanks
        if par.lines:
            self.pars.append(par)

        lines = None

        self.analyze()

        # build xhtml tree

        em = parsers.em
        self.xhtml = em.html(
            em.head(
                em.title(' '),
                # pylint: disable=W0142
                em.meta(**{
                    'http-equiv': 'Content-Style-Type',
                    'content': 'text/css'
                }),
                em.meta(
                    **{
                        'http-equiv': 'Content-Type',
                        'content': mt.xhtml + '; charset=utf-8'
                    })),
            em.body())

        for body in xpath(self.xhtml, '//xhtml:body'):
            xhtmlparser = lxml.html.XHTMLParser()
            for par in self.pars:
                p = etree.fromstring(self.ship_out(par), xhtmlparser)
                p.tail = '\n\n'
                body.append(p)

        self.pars = []
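The blank-line segmentation above can be shown in isolation with a hypothetical stand-in for `Par` (only the fields used here):

class Par:  # hypothetical stand-in with only the fields used above
    def __init__(self):
        self.lines, self.before, self.after = [], 0, 0

def split_pars(text):
    lines = [line.rstrip() for line in text.splitlines()] + ['']
    pars, blanks, par = [], 0, Par()
    for line in lines:
        if not line:
            blanks += 1
            continue
        if blanks and par.lines:   # flush the finished paragraph
            par.after = blanks
            pars.append(par)
            par = Par()
            par.before = blanks
            blanks = 0
        par.lines.append(line)
    par.after = blanks
    if par.lines:
        pars.append(par)
    return pars

pars = split_pars("Title\n\n\nFirst line.\nSecond line.\n")
print([(p.lines, p.before, p.after) for p in pars])
# [(['Title'], 0, 2), (['First line.', 'Second line.'], 2, 1)]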
Example #19
    def _to_xhtml11(self):
        """ Make vanilla xhtml more conform to xhtml 1.1 """

        # Change content-type meta to application/xhtml+xml.
        for meta in xpath(self.xhtml,
                          "/xhtml:html/xhtml:head/xhtml:meta[@http-equiv]"):
            if meta.get('http-equiv').lower() == 'content-type':
                meta.set('content', mt.xhtml + '; charset=utf-8')

        # drop javascript

        for script in xpath(self.xhtml, "//xhtml:script"):
            script.drop_tree()

        # drop form

        for form in xpath(self.xhtml, "//xhtml:form"):
            form.drop_tree()

        # blockquotes

        for bq in xpath(self.xhtml, "//xhtml:blockquote"):
            # no naked text allowed in <blockquote>
            div = etree.Element(NS.xhtml.div)
            for child in list(bq):  # copy: we are moving the children
                div.append(child)
            div.text = bq.text
            bq.text = None
            bq.append(div)
            # lxml.html.defs.block_tags

        # insert tbody

        for table in xpath(self.xhtml, "//xhtml:table[xhtml:tr]"):
            # no naked <tr> allowed in <table>
            tbody = etree.Element(NS.xhtml.tbody)
            # iterate over a copy: moving rows out of the live child
            # list while iterating it would skip elements
            for tr in list(table):
                if tr.tag == NS.xhtml.tr:
                    tbody.append(tr)
            table.append(tbody)

        # move lang to xml:lang

        for elem in xpath(self.xhtml, "//xhtml:*[@lang]"):
            # bug in lxml 2.2.2: sometimes deletes wrong element
            # so we delete both and reset the right one
            lang = elem.get('lang')
            try:
                del elem.attrib[NS.xml.lang]
            except KeyError:
                pass
            del elem.attrib['lang']
            elem.set(NS.xml.lang, lang)

        # strip deprecated attributes

        for a, t in DEPRECATED.items():
            for tag in t.split():
                for elem in xpath(self.xhtml, "//xhtml:%s[@%s]" % (tag, a)):
                    del elem.attrib[a]

        # strip empty class attributes

        for elem in xpath(
                self.xhtml,
                "//xhtml:*[@class and normalize-space (@class) = '']"):
            del elem.attrib['class']

        # strip bogus header markup by Joe L.
        for elem in xpath(self.xhtml, "//xhtml:h1"):
            if elem.text and elem.text.startswith(
                    "The Project Gutenberg eBook"):
                elem.tag = NS.xhtml.p
        for elem in xpath(self.xhtml, "//xhtml:h3"):
            if elem.text and elem.text.startswith("E-text prepared by"):
                elem.tag = NS.xhtml.p
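`DEPRECATED` is defined outside this excerpt; from the loop above it must map an attribute name to a space-separated list of tags to strip it from, along these lines (hypothetical values):

# hypothetical shape of the DEPRECATED mapping consumed above
DEPRECATED = {
    'align': 'caption div h1 h2 h3 h4 h5 h6 p table',
    'border': 'img object table',
}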
Example #20
    def make_toc(self, xhtml):
        """ Build a TOC from HTML headers.

        Return a list of tuples (url, text, depth).

        Page numbers are also included because the DTBook NCX needs
        the play_order to be sequential.

        """
        def id_generator(i=0):
            """ Generate an id for the TOC to link to. """
            while True:
                yield 'pgepubid%05d' % i
                i += 1

        idg = id_generator()

        def get_id(elem):
            """ Get the id of the element or generate and set one. """
            if not elem.get('id'):
                elem.set('id', six.next(idg))
            return elem.get('id')

        toc = []
        last_depth = 0

        for header in xpath(
                xhtml,
                '//xhtml:h1|//xhtml:h2|//xhtml:h3|//xhtml:h4|'
                # DP page number
                '//xhtml:*[contains (@class, "pageno")]|'
                # DocUtils contents header
                '//xhtml:p[contains (@class, "topic-title")]'):

            text = gg.normalize(
                etree.tostring(header,
                               method="text",
                               encoding=six.text_type,
                               with_tail=False))

            text = header.get('title', text).strip()

            if not text:
                # so <h2 title=""> may be used to suppress TOC entry
                continue

            if header.get('class', '').find('pageno') > -1:
                toc.append(
                    ["%s#%s" % (self.attribs.url, get_id(header)), text, -1])
            else:
                # header
                if text.lower().startswith('by '):
                    # common error in PG: <h2>by Lewis Carroll</h2> should
                    # yield no TOC entry
                    continue

                try:
                    depth = int(header.tag[-1:])
                except ValueError:
                    depth = 2  # avoid top level

                # fix bogus header numberings
                if depth > last_depth + 1:
                    depth = last_depth + 1

                last_depth = depth

                # if <h*> is first element of a <div> use <div> instead
                parent = header.getparent()
                if (parent.tag == NS.xhtml.div and parent[0] == header
                        and parent.text and parent.text.strip() == ''):
                    header = parent

                toc.append([
                    "%s#%s" % (self.attribs.url, get_id(header)), text, depth
                ])

        return toc
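Note that the clamp above only limits jumps relative to the previous entry. For example (hypothetical heading sequence):

# headings:        h1  h4  h2
# raw depths:       1   4   2
# clamped depths:   1   2   2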
Example #21
    def strip_pagenumbers(xhtml, strip_classes):
        """ Strip dp page numbers.

        Rationale: DP implements page numbers either with float or
        with absolute positioning. Float is not supported by Kindle.
        Absolute positioning is not allowed in epub.

        If we left these in, they would show up as numbers in the
        middle of the text.

        To still keep links working, we replace all page number
        contraptions we can find with empty <a>'s.

        """

        # look for elements with a class that is in strip_classes

        for class_ in strip_classes:
            xp = "//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % class_

            count = 0
            for elem in xpath(xhtml, xp):

                # save textual content
                text = gg.normalize(
                    etree.tostring(elem,
                                   method="text",
                                   encoding=six.text_type,
                                   with_tail=False))
                if len(text) > 10:
                    # safeguard against removing things that are not pagenumbers
                    continue

                if not text:
                    text = elem.get('title')

                # look for id anywhere inside element
                id_ = elem.xpath(".//@id")

                # transmogrify element into empty <a>
                tail = elem.tail
                elem.clear()
                elem.tag = NS.xhtml.a
                if id_:
                    # some blockheaded PPers include more than
                    # one page number in one span. take the last id
                    # because the others represent empty pages.
                    elem.set('id', id_[-1])

                if class_ in DP_PAGENUMBER_CLASSES:
                    # mark element as rewritten pagenumber. we
                    # actually don't use this class for styling
                    # because it is on an empty element
                    elem.set('class', 'x-ebookmaker-pageno')

                if text:
                    elem.set('title', text)
                elem.tail = tail
                count += 1

                # The OPS Spec 2.0 is very clear: "Reading Systems
                # must be XML processors as defined in XML 1.1."
                # Nevertheless many browser-plugin ebook readers use
                # the HTML parsers of the browser.  But HTML parsers
                # don't grok the minimized form of empty elements.
                #
                # This will force lxml to output the non-minimized form
                # of the element.
                elem.text = ''

            if count:
                warning("%d elements having class %s have been rewritten." %
                        (count, class_))
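Sketched on a typical DP page-number span (hypothetical input; the actual classes come from `strip_classes`):

# before:  <span class="pagenum" id="Page_23">[Pg 23]</span>
# after:   <a class="x-ebookmaker-pageno" id="Page_23" title="[Pg 23]"></a>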
Example #22
    def _fix_anchors(self):
        """ Move name to id and fix hrefs and ids. """

        # move anchor name to id
        # 'id' values are more strict than 'name' values
        # try to fix ill-formed ids

        seen_ids = set()

        for anchor in (xpath(self.xhtml, "//xhtml:a[@name]") +
                       xpath(self.xhtml, "//xhtml:*[@id]")):
            id_ = anchor.get('id') or anchor.get('name')

            if 'name' in anchor.attrib:
                del anchor.attrib['name']
            if 'id' in anchor.attrib:
                del anchor.attrib['id']
            if NS.xml.id in anchor.attrib:
                del anchor.attrib[NS.xml.id]

            id_ = self._fix_id(id_)

            if not parsers.RE_XML_NAME.match(id_):
                error("Dropping ill-formed id '%s' in %s" %
                      (id_, self.attribs.url))
                continue

            # well-formed id
            if id_ in seen_ids:
                error("Dropping duplicate id '%s' in %s" %
                      (id_, self.attribs.url))
                continue

            seen_ids.add(id_)
            anchor.set('id', id_)

        # try to fix bogus fragment ids
        # 1. fragments point to xml:id, so must be well-formed ids
        # 2. the ids they point to must exist

        for link in xpath(self.xhtml, "//xhtml:*[@href]"):
            href = link.get('href')
            hre, frag = urllib.parse.urldefrag(href)
            if frag:
                frag = self._fix_internal_frag(frag)

                if not frag:
                    # non-recoverable ill-formed frag
                    del link.attrib['href']
                    self.add_class(link, 'pgkilled')
                    error('Dropping ill-formed frag in %s' % href)
                    continue

                # well-formed frag
                if hre:
                    # we have url + frag
                    link.set(
                        'href', "%s#%s" %
                        (hre, urllib.parse.quote(frag.encode('utf-8'))))
                    self.add_class(link, 'pgexternal')
                elif frag in seen_ids:
                    # we have only frag
                    link.set('href',
                             "#%s" % urllib.parse.quote(frag.encode('utf-8')))
                    self.add_class(link, 'pginternal')
                else:
                    del link.attrib['href']
                    self.add_class(link, 'pgkilled')
                    error("Dropping frag to non-existing id in %s" % href)
Example #23
    def build(self, job):
        """ Build epub """

        ncx = TocNCX(job.dc)
        parsers = []
        css_count = 0

        # add CSS parser
        self.add_external_css(job.spider, None, PRIVATE_CSS, 'pgepub.css')

        try:
            chunker = HTMLChunker.HTMLChunker()
            coverpage_url = None

            # do images early as we need the new dimensions later
            for p in job.spider.parsers:
                if hasattr(p, 'resize_image'):
                    if 'coverpage' in p.attribs.rel:
                        if job.maintype == 'kindle':
                            np = p.resize_image(MAX_IMAGE_SIZE_KINDLE,
                                                MAX_COVER_DIMEN_KINDLE, 'jpeg')
                        else:
                            np = p.resize_image(MAX_IMAGE_SIZE,
                                                MAX_COVER_DIMEN)
                        np.id = p.attribs.get('id', 'coverpage')
                        coverpage_url = p.attribs.url
                    else:
                        if job.maintype == 'kindle':
                            np = p.resize_image(MAX_IMAGE_SIZE_KINDLE,
                                                MAX_IMAGE_DIMEN_KINDLE)
                        else:
                            np = p.resize_image(MAX_IMAGE_SIZE,
                                                MAX_IMAGE_DIMEN)
                        np.id = p.attribs.get('id')
                    parsers.append(np)

            for p in job.spider.parsers:
                if p.mediatype() in OPS_CONTENT_DOCUMENTS:
                    debug("URL: %s" % p.attribs.url)

                    if hasattr(p, 'rst2epub2'):
                        xhtml = p.rst2epub2(job)

                        if options.verbose >= 2:
                            # write html to disk for debugging
                            debugfilename = os.path.join(
                                job.outputdir, job.outputfile)
                            debugfilename = (os.path.splitext(debugfilename)[0]
                                             + '.' + job.maintype + '.debug.html')
                            with open(debugfilename, 'wb') as fp:
                                fp.write(
                                    etree.tostring(xhtml, encoding='utf-8'))

                    else:
                        # make a copy so we can mess around
                        p.parse()
                        xhtml = copy.deepcopy(p.xhtml)

                    strip_classes = self.get_classes_that_float(xhtml)
                    strip_classes = strip_classes.intersection(STRIP_CLASSES)
                    if strip_classes:
                        self.strip_pagenumbers(xhtml, strip_classes)

                    # build up TOC
                    # has side effects on xhtml
                    ncx.toc += p.make_toc(xhtml)

                    self.insert_root_div(xhtml)
                    self.fix_charset(xhtml)
                    self.fix_style_elements(xhtml)
                    self.reflow_pre(xhtml)

                    # strip all links to items not in manifest
                    p.strip_links(xhtml, job.spider.dict_urls_mediatypes())
                    self.strip_links(xhtml, job.spider.dict_urls_mediatypes())

                    self.strip_noepub(xhtml)
                    # self.strip_rst_dropcaps (xhtml)

                    self.fix_html_image_dimensions(xhtml)
                    if coverpage_url:
                        self.remove_coverpage(xhtml, coverpage_url)

                    # externalize and fix CSS
                    for style in xpath(xhtml, '//xhtml:style'):
                        self.add_external_css(job.spider, xhtml, style.text,
                                              "%d.css" % css_count)
                        css_count += 1
                        style.drop_tree()

                    self.add_external_css(job.spider, xhtml, None,
                                          'pgepub.css')

                    self.add_meta_generator(xhtml)

                    debug("Splitting %s ..." % p.attribs.url)
                    chunker.next_id = 0
                    chunker.split(xhtml, p.attribs)

            for p in job.spider.parsers:
                if hasattr(p, 'sheet'):
                    self.fix_incompatible_css(p.sheet)
                    p.rewrite_links(self.url2filename)
                    parsers.append(p)

            # after splitting html into chunks we have to rewrite all
            # internal links in HTML
            chunker.rewrite_internal_links()
            # also in the TOC
            if not ncx.toc:
                ncx.toc.append([job.spider.parsers[0].attribs.url, 'Start', 1])
            chunker.rewrite_internal_links_toc(ncx.toc)

            # make absolute links zip-filename-compatible
            chunker.rewrite_links(self.url2filename)
            ncx.rewrite_links(self.url2filename)

            # Do away with the chunker and copy all chunks into new parsers.
            # These are fake parsers that never actually parsed anything;
            # we use them just to hold our data.
            for chunk, attribs in chunker.chunks:
                p = ParserFactory.ParserFactory.get(attribs)
                p.xhtml = chunk
                parsers.append(p)

            self.shipout(job, parsers, ncx)

        except Exception as what:
            exception("Error building Epub: %s" % what)
            raise