Ejemplos de urldefrag en Python, ejemplos de polyglot.urllib.urldefrag en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: reader.py Proyecto: jimman2003/calibre

 def _toc_from_html(self, opf):
     '''
     Build the table of contents from the HTML document referenced by the
     guide's 'toc' entry.

     Returns False when the guide has no 'toc' reference, True otherwise.
     Every link whose target (ignoring the fragment) is a manifest item
     becomes a TOC entry; the texts of repeated links to the same href are
     joined with single spaces. The ``opf`` argument is not used.
     '''
     guide = self.oeb.guide
     if 'toc' not in guide:
         return False
     self.log.debug('Reading TOC from HTML...')
     itempath, frag = urldefrag(guide['toc'].href)
     item = self.oeb.manifest.hrefs[itempath]
     html = item.data
     if frag:
         # Find the fragment target by id, falling back to a name attribute.
         elems = xpath(html, './/*[@id="%s"]' % frag)
         if not elems:
             elems = xpath(html, './/*[@name="%s"]' % frag)
         elem = elems[0] if elems else html
         # Walk upwards until the element contains a link or we are back
         # at the document itself.
         while elem != html and not xpath(elem, './/h:a[@href]'):
             elem = elem.getparent()
         html = elem
     titles = defaultdict(list)
     order = []
     known_hrefs = self.oeb.manifest.hrefs
     for anchor in xpath(html, './/h:a[@href]'):
         href = item.abshref(urlnormalize(anchor.attrib['href']))
         target = urldefrag(href)[0]
         if target in known_hrefs:
             text = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
             # Remember first-seen order so entries follow document order.
             if href not in titles:
                 order.append(href)
             titles[href].append(text)
     toc = self.oeb.toc
     for href in order:
         toc.add(' '.join(titles[href]), href)
     return True

Ejemplo n.º 2

0

Mostrar archivo

Archivo: reader.py Proyecto: j-howell/calibre

 def _toc_from_html(self, opf):
     '''
     Populate ``self.oeb.toc`` from the links found in the guide's 'toc'
     document.

     Returns False if the guide has no 'toc' reference and True once the
     links have been processed. Links whose target file is not in the
     manifest are ignored; link texts sharing an href are merged into one
     entry. ``opf`` is accepted but not used.
     '''
     if 'toc' not in self.oeb.guide:
         return False
     self.log.debug('Reading TOC from HTML...')
     itempath, frag = urldefrag(self.oeb.guide['toc'].href)
     item = self.oeb.manifest.hrefs[itempath]
     html = item.data
     if frag:
         # Try the id attribute first, then the name attribute.
         for query in ('.//*[@id="%s"]', './/*[@name="%s"]'):
             elems = xpath(html, query % frag)
             if elems:
                 break
         elem = elems[0] if elems else html
         # Widen the scope to the nearest ancestor that holds links.
         while elem != html and not xpath(elem, './/h:a[@href]'):
             elem = elem.getparent()
         html = elem
     titles = defaultdict(list)
     order = []
     for anchor in xpath(html, './/h:a[@href]'):
         raw_href = anchor.attrib['href']
         href = item.abshref(urlnormalize(raw_href))
         path = urldefrag(href)[0]
         if path not in self.oeb.manifest.hrefs:
             continue
         title = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
         if href not in titles:
             order.append(href)
         titles[href].append(title)
     toc = self.oeb.toc
     for href in order:
         merged_title = ' '.join(titles[href])
         toc.add(merged_title, href)
     return True

Ejemplo n.º 3

0

Mostrar archivo

Archivo: serializer.py Proyecto: smdx023/calibre

 def serialize_href(self, href, base=None):
     '''
     Serialize the href attribute of an <a> or <reference> tag. It is
     serialized as filepos="000000000" and a pointer to its location is
     stored in self.href_offsets so that the correct value can be filled in
     at the end.

     :param href: The link target as found in the document.
     :param base: Optional manifest item used to resolve relative paths
         and fragment-only links.
     :return: True if a placeholder was written, False if the link cannot
         be serialized (unparsable URL, target not in the manifest, target
         not in the spine, or a fragment-only link with no base).
     '''
     hrefs = self.oeb.manifest.hrefs
     try:
         path, frag = urldefrag(urlnormalize(href))
     except ValueError:
         # Unparsable URL
         return False
     if path and base:
         path = base.abshref(path)
     if path and path not in hrefs:
         return False
     item = hrefs[path] if path else None
     if item and item.spine_position is None:
         return False
     if not item and base is None:
         # A fragment-only link with no base document cannot be resolved;
         # previously this fell through to base.href and raised
         # AttributeError.
         return False
     path = item.href if item else base.href
     href = '#'.join((path, frag)) if frag else path
     buf = self.buf
     buf.write(b'filepos=')
     # Record where the placeholder digits start so they can be patched later.
     self.href_offsets[href].append(buf.tell())
     buf.write(b'0000000000')
     return True

Ejemplo n.º 4

0

Mostrar archivo

Archivo: filenames.py Proyecto: JimmXinu/calibre

    def __call__(self, oeb, opts):
        '''
        Apply ``self.rename_map`` to every link in the book.

        Links inside element-tree documents and stylesheets are rewritten
        through ``self.url_replacer``; guide references and TOC entries are
        then updated to the renamed targets.
        '''
        import css_parser
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        for item in oeb.manifest.items:
            self.current_item = item
            data = item.data
            if etree.iselement(data):
                rewrite_links(data, self.url_replacer)
            elif hasattr(data, 'cssText'):
                css_parser.replaceUrls(data, self.url_replacer)

        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                href, frag = urldefrag(urlnormalize(ref.href))
                replacement = self.rename_map.get(href, None)
                if replacement is None:
                    continue
                # Re-attach the fragment, if the original reference had one.
                ref.href = (replacement + '#' + frag) if frag else replacement

        if self.oeb.toc:
            self.fix_toc_entry(self.oeb.toc)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: serializer.py Proyecto: JimmXinu/calibre

 def serialize_href(self, href, base=None):
     '''
     Serialize the href attribute of an <a> or <reference> tag. It is
     serialized as filepos="000000000" and a pointer to its location is
     stored in self.href_offsets so that the correct value can be filled in
     at the end.

     :param href: The link target as found in the document.
     :param base: Optional manifest item used to resolve relative paths
         and fragment-only links.
     :return: True if a placeholder was written, False if the link cannot
         be serialized (unparseable URL, target not in the manifest, target
         not in the spine, or a fragment-only link with no base).
     '''
     hrefs = self.oeb.manifest.hrefs
     try:
         path, frag = urldefrag(urlnormalize(href))
     except ValueError:
         # Unparseable URL
         return False
     if path and base:
         path = base.abshref(path)
     if path and path not in hrefs:
         return False
     item = hrefs[path] if path else None
     if item and item.spine_position is None:
         return False
     if not item and base is None:
         # Without a resolved item or a base document there is nothing to
         # link to; previously this raised AttributeError on base.href.
         return False
     path = item.href if item else base.href
     href = '#'.join((path, frag)) if frag else path
     buf = self.buf
     buf.write(b'filepos=')
     # Remember the offset of the placeholder so it can be filled in later.
     self.href_offsets[href].append(buf.tell())
     buf.write(b'0000000000')
     return True

Ejemplo n.º 6

0

Mostrar archivo

Archivo: serializer.py Proyecto: JimmXinu/calibre

    def serialize_guide(self):
        '''
        The Kindle decides where to open a book based on the presence of
        an item in the guide that looks like
        <reference type="text" title="Start" href="chapter-one.xhtml"/>

        Similarly an item with type="toc" controls where the Goto Table of
        Contents operation on the kindle goes.

        Only references whose target is a manifest item with a media type
        in OEB_DOCS are written.
        '''

        out = self.buf
        hrefs = self.oeb.manifest.hrefs
        out.write(b'<guide>')
        for ref in self.oeb.guide.values():
            path = urldefrag(ref.href)[0]
            if not (path in hrefs and hrefs[path].media_type in OEB_DOCS):
                continue

            # An 'other.' prefix on the type is dropped in the output.
            ref_type = ref.type
            if ref_type.startswith('other.') :
                ref_type = ref_type.replace('other.','')
            out.write(b'<reference type="')
            self.serialize_text(ref_type, quot=True)
            out.write(b'" ')
            if ref.title is not None:
                out.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                out.write(b'" ')
                # Remember the start reference so its offset can be
                # recorded when links are fixed up.
                if is_guide_ref_start(ref):
                    self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
            out.write(b' />')

        out.write(b'</guide>')

Ejemplo n.º 7

0

Mostrar archivo

 def map_resources(self, oeb_book):
     '''
     Assign output file names to images and register link targets.

     Each image in the manifest gets a zero-padded, counter-based name
     stored in ``self.images``. For each spine item, the item itself and
     every link attribute pointing at an href in ``self.base_hrefs`` are
     registered through ``self.get_link_id``.
     '''
     # The set of link-carrying attributes is identical for every item,
     # so build it once instead of once per spine item.
     link_attrs = set(html.defs.link_attrs)
     link_attrs.add(XLINK('href'))
     for item in oeb_book.manifest:
         if item.media_type in OEB_IMAGES:
             if item.href not in self.images:
                 ext = os.path.splitext(item.href)[1]
                 fname = '%s%s' % (len(self.images), ext)
                 fname = fname.zfill(10)
                 self.images[item.href] = fname
         if item in oeb_book.spine:
             self.get_link_id(item.href)
             # NOTE(review): find() yields None if the document has no
             # <body>, which would fail on root.iter() -- confirm callers
             # guarantee a body.
             root = item.data.find(XHTML('body'))
             for el in root.iter():
                 attribs = el.attrib
                 try:
                     # Skip nodes whose tag is not a string.
                     if not isinstance(el.tag, string_or_bytes):
                         continue
                 except Exception:
                     continue
                 for attr in attribs:
                     if attr in link_attrs:
                         href = item.abshref(attribs[attr])
                         href, frag_id = urldefrag(href)
                         if href in self.base_hrefs:
                             self.get_link_id(href, frag_id)

Ejemplo n.º 8

0

Mostrar archivo

    def __call__(self, oeb, opts):
        '''
        Rewrite links throughout the book according to ``self.rename_map``.

        Document and stylesheet links go through ``self.url_replacer``;
        guide references and the TOC are then pointed at the renamed files.
        '''
        import css_parser
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        replacer = self.url_replacer
        for item in oeb.manifest.items:
            self.current_item = item
            if etree.iselement(item.data):
                rewrite_links(self.current_item.data, replacer)
            elif hasattr(item.data, 'cssText'):
                css_parser.replaceUrls(item.data, replacer)

        guide = self.oeb.guide
        if guide:
            for ref in guide.values():
                normalized = urlnormalize(ref.href)
                href, frag = urldefrag(normalized)
                replacement = self.rename_map.get(href, None)
                if replacement is not None:
                    # Keep the fragment of the original reference.
                    suffix = ('#' + frag) if frag else ''
                    ref.href = replacement + suffix

        if self.oeb.toc:
            self.fix_toc_entry(self.oeb.toc)

Ejemplo n.º 9

0

Mostrar archivo

Archivo: serializer.py Proyecto: smdx023/calibre

    def serialize_guide(self):
        '''
        The Kindle decides where to open a book based on the presence of
        an item in the guide that looks like
        <reference type="text" title="Start" href="chapter-one.xhtml"/>

        Similarly an item with type="toc" controls where the Goto Table of
        Contents operation on the kindle goes.

        References are skipped unless they point at a manifest item whose
        media type is in OEB_DOCS.
        '''

        buf = self.buf
        write = buf.write
        hrefs = self.oeb.manifest.hrefs
        write(b'<guide>')
        for ref in self.oeb.guide.values():
            path = urldefrag(ref.href)[0]
            if path not in hrefs:
                continue
            if hrefs[path].media_type not in OEB_DOCS:
                continue

            write(b'<reference type="')
            # Types carrying an 'other.' prefix are written without it.
            type_text = (ref.type.replace('other.', '')
                         if ref.type.startswith('other.') else ref.type)
            self.serialize_text(type_text, quot=True)
            write(b'" ')
            if ref.title is not None:
                write(b'title="')
                self.serialize_text(ref.title, quot=True)
                write(b'" ')
                if is_guide_ref_start(ref):
                    # Recorded so the start offset can be resolved later.
                    self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
            write(b' />')

        write(b'</guide>')

Ejemplo n.º 10

0

Mostrar archivo

Archivo: oeb2html.py Proyecto: j-howell/calibre

 def map_resources(self, oeb_book):
     '''
     Build the image-name map and register link ids for spine content.

     Images found in the manifest are given a counter-based file name,
     left-padded with zeros to ten characters, in ``self.images``. Spine
     items are registered with ``self.get_link_id``, as is every link
     attribute in their body that resolves to an href in
     ``self.base_hrefs``.
     '''
     # Loop-invariant: compute the set of link attributes a single time.
     link_attrs = set(html.defs.link_attrs)
     link_attrs.add(XLINK('href'))
     for item in oeb_book.manifest:
         if item.media_type in OEB_IMAGES:
             if item.href not in self.images:
                 ext = os.path.splitext(item.href)[1]
                 fname = '%s%s' % (len(self.images), ext)
                 fname = fname.zfill(10)
                 self.images[item.href] = fname
         if item in oeb_book.spine:
             self.get_link_id(item.href)
             # NOTE(review): assumes every spine document has a <body>;
             # find() returns None otherwise -- confirm.
             root = item.data.find(XHTML('body'))
             for el in root.iter():
                 attribs = el.attrib
                 try:
                     # Only nodes with a string tag are inspected.
                     if not isinstance(el.tag, string_or_bytes):
                         continue
                 except Exception:
                     continue
                 for attr in attribs:
                     if attr in link_attrs:
                         href = item.abshref(attribs[attr])
                         href, frag_id = urldefrag(href)
                         if href in self.base_hrefs:
                             self.get_link_id(href, frag_id)

Ejemplo n.º 11

0

Mostrar archivo

Archivo: reader.py Proyecto: zwlistu/calibre

    def _toc_from_navpoint(self, item, toc, navpoint):
        '''
        Recursively convert the ncx:navPoint children of ``navpoint`` into
        nodes added to ``toc``.

        :param item: Manifest item used to resolve relative ``src`` values.
        :param toc: TOC node that receives the new entries.
        :param navpoint: Element whose direct ncx:navPoint children are read.

        Untitled navPoints are skipped but their children are added at the
        current level. NavPoints with neither a usable target nor children
        are dropped.
        '''
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                # No label: promote the grandchildren to this level.
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
            path, _ = urldefrag(href)
            if path and path not in self.oeb.manifest.hrefs:
                # Retry the lookup with a normalized path.
                path = urlnormalize(path)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            # Named node_id to avoid shadowing the builtin id().
            node_id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except Exception:
                # Non-numeric playOrder: fall back to the next free value.
                po = self.oeb.toc.next_play_order()

            authorElement = xpath(child,
                    'descendant::calibre:meta[@name = "author"]')
            author = authorElement[0].text if authorElement else None

            descriptionElement = xpath(child,
                    'descendant::calibre:meta[@name = "description"]')
            if descriptionElement:
                # An empty description is treated as no description.
                description = etree.tostring(descriptionElement[0],
                    method='text', encoding='unicode').strip() or None
            else:
                description = None

            index_image = xpath(child,
                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=node_id, klass=klass,
                    play_order=po, description=description, author=author,
                           toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)

Ejemplo n.º 12

0

Mostrar archivo

Archivo: mobi8.py Proyecto: smdx023/calibre

    def write_opf(self, guide, toc, spine, resource_map):
        '''
        Write ``metadata.opf`` and ``toc.ncx`` into the current working
        directory.

        :param guide: Guide references; also scanned for an inline ToC when
            ``toc`` has fewer than two entries.
        :param toc: Table of contents to store in the OPF.
        :param spine: Spine passed to ``OPFCreator.create_spine``.
        :param resource_map: Sequence indexed by ``self.cover_offset`` to
            find the cover.
        :return: The string ``'metadata.opf'``.
        '''
        mi = self.header.exth.mi
        if (self.cover_offset is not None
                and self.cover_offset < len(resource_map)):
            mi.cover = resource_map[self.cover_offset]

        if len(list(toc)) < 2:
            self.log.warn('KF8 has no metadata Table of Contents')

            for ref in guide:
                if ref.type == 'toc':
                    href = ref.href()
                    href, frag = urldefrag(href)
                    if os.path.exists(href.replace('/', os.sep)):
                        try:
                            toc = self.read_inline_toc(href, frag)
                        except Exception:
                            # Best effort: keep the existing toc on failure.
                            self.log.exception('Failed to read inline ToC')

        opf = OPFCreator(os.getcwd(), mi)
        opf.guide = guide

        def exclude(path):
            return os.path.basename(path) == 'debug-raw.html'

        # If there are no images then the azw3 input plugin dumps all
        # binary records as .unknown images, remove them
        if self.for_tweak and os.path.isdir('images'):
            files = os.listdir('images')
            unknown = [x for x in files if x.endswith('.unknown')]
            if len(files) == len(unknown):
                for f in files:
                    os.remove('images/' + f)

        if self.for_tweak:
            try:
                os.remove('debug-raw.html')
            except OSError:
                # The file is optional; a missing file is not an error.
                pass

        opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
        for entry in opf.manifest:
            if entry.mime_type == 'text/html':
                entry.mime_type = 'application/xhtml+xml'
        opf.create_spine(spine)
        opf.set_toc(toc)
        ppd = getattr(self.header.exth, 'page_progression_direction', None)
        if ppd in {'ltr', 'rtl', 'default'}:
            opf.page_progression_direction = ppd
        pwm = getattr(self.header.exth, 'primary_writing_mode', None)
        if pwm is not None:
            opf.primary_writing_mode = pwm

        with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return 'metadata.opf'

Ejemplo n.º 13

0

Mostrar archivo

 def __call__(self, oeb, context):
     '''
     Remove manifest items that nothing in the book refers to.

     An item counts as used when it is named by a metadata value, is a
     guide target, is in the spine, or is linked (directly or through
     other used items) from a used document or stylesheet. Everything
     else is removed from ``oeb.manifest``.
     '''
     import css_parser
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # TOC items are required to be in the spine
     for item in oeb.spine:
         used.add(item)
     # Follow links outwards from the used items until nothing new turns up.
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except Exception:
                         # Links that cannot be resolved are ignored.
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in css_parser.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     # Iterate over a snapshot: items are removed from the manifest inside
     # the loop, which must not disturb the iteration.
     for item in list(oeb.manifest.values()):
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)

Ejemplo n.º 14

0

Mostrar archivo

Archivo: filenames.py Proyecto: JimmXinu/calibre

    def fix_toc_entry(self, toc):
        '''
        Point ``toc`` and all of its descendants at renamed files.

        If the entry's href (without fragment) has a replacement in
        ``self.rename_map``, the href is rewritten and the original
        fragment is re-attached.
        '''
        if toc.href:
            href, frag = urldefrag(urlnormalize(toc.href))
            replacement = self.rename_map.get(href, None)
            if replacement is not None:
                toc.href = '#'.join((replacement, frag)) if frag else replacement

        for child in toc:
            self.fix_toc_entry(child)

Ejemplo n.º 15

0

Mostrar archivo

Archivo: trimmanifest.py Proyecto: JimmXinu/calibre

 def __call__(self, oeb, context):
     '''
     Trim the manifest down to the items that are actually referenced.

     The starting set is every item named by a metadata value, every
     guide target and every spine item. Links in used documents and
     stylesheets are then followed repeatedly; whatever is never reached
     is removed from ``oeb.manifest``.
     '''
     import css_parser
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # TOC items are required to be in the spine
     for item in oeb.spine:
         used.add(item)
     # Each pass inspects only the items discovered in the previous pass.
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except Exception:
                         # Skip links that cannot be normalized/resolved.
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in css_parser.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     # Take a snapshot first so removing items cannot interfere with the
     # iteration over the manifest.
     for item in list(oeb.manifest.values()):
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)

Ejemplo n.º 16

0

Mostrar archivo

    def fix_toc_entry(self, toc):
        '''
        Recursively update TOC hrefs whose target file was renamed.

        The fragment, if any, is split off before looking the href up in
        ``self.rename_map`` and appended again to the replacement.
        '''
        original = toc.href
        if original:
            base_href, frag = urldefrag(urlnormalize(original))
            replacement = self.rename_map.get(base_href, None)

            if replacement is not None:
                if frag:
                    replacement = '#'.join((replacement, frag))
                toc.href = replacement

        for entry in toc:
            self.fix_toc_entry(entry)

Ejemplo n.º 17

0

Mostrar archivo

Archivo: filenames.py Proyecto: JimmXinu/calibre

    def url_replacer(self, orig_url):
        '''
        Return the replacement for ``orig_url`` as seen from
        ``self.current_item``.

        URLs with a scheme are returned unchanged. Local URLs are resolved
        against the item's pre-rename identity (when
        ``self.renamed_items_map`` has one), mapped through
        ``self.rename_map`` and made relative again, keeping any fragment.
        '''
        url = urlnormalize(orig_url)
        if urlparse(url).scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urldefrag(url)
        current = self.current_item
        orig_item = current
        if self.renamed_items_map:
            orig_item = self.renamed_items_map.get(current.href, current)

        href = orig_item.abshref(path)
        target = self.rename_map.get(href, href)
        replacement = self.current_item.relhref(target)
        return (replacement + '#' + frag) if frag else replacement

Ejemplo n.º 18

0

Mostrar archivo

    def url_replacer(self, orig_url):
        '''
        Map a link found in ``self.current_item`` to its post-rename form.

        Non-local URLs (those with a scheme) come back untouched. For local
        URLs the path is resolved relative to the item's original location,
        looked up in ``self.rename_map`` and re-expressed relative to the
        current item; the fragment is preserved.
        '''
        url = urlnormalize(orig_url)
        parts = urlparse(url)
        if parts.scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urldefrag(url)
        item = self.current_item
        # Prefer the item as it was before renaming, when that is known.
        orig_item = (self.renamed_items_map.get(item.href, item)
                     if self.renamed_items_map else item)

        href = orig_item.abshref(path)
        renamed = self.rename_map.get(href, href)
        replacement = self.current_item.relhref(renamed)
        if not frag:
            return replacement
        return replacement + '#' + frag

Ejemplo n.º 19

0

Mostrar archivo

Archivo: reader.py Proyecto: j-howell/calibre

 def _guide_from_opf(self, opf):
     '''
     Copy the <reference> elements of the OPF guide into ``self.oeb.guide``.

     A reference whose path is not in the manifest is matched
     case-insensitively against the manifest hrefs; if that also fails it
     is logged and skipped. Only the first reference of each type is kept.
     '''
     guide = self.oeb.guide
     manifest = self.oeb.manifest
     for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
         ref_href = elem.get('href')
         path = urlnormalize(urldefrag(ref_href)[0])
         if path not in manifest.hrefs:
             # Fall back to a case-insensitive match on the manifest hrefs.
             corrected_href = next(
                 (href for href in manifest.hrefs
                  if href.lower() == path.lower()),
                 None)
             if corrected_href is None:
                 self.logger.warn(u'Guide reference %r not found' % ref_href)
                 continue
             # NOTE(review): the corrected href carries no fragment, so a
             # fragment on the original reference is dropped here.
             ref_href = corrected_href
         typ = elem.get('type')
         if typ not in guide:
             guide.add(typ, elem.get('title'), ref_href)

Ejemplo n.º 20

0

Mostrar archivo

Archivo: reader.py Proyecto: jimman2003/calibre

 def _guide_from_opf(self, opf):
     '''
     Populate ``self.oeb.guide`` from the guide section of the OPF.

     References pointing outside the manifest are retried with a
     case-insensitive comparison, and skipped with a warning when no
     manifest href matches. A type already present in the guide is not
     overwritten.
     '''
     guide = self.oeb.guide
     known_hrefs = self.oeb.manifest.hrefs
     for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
         ref_href = elem.get('href')
         path = urlnormalize(urldefrag(ref_href)[0])
         if path not in known_hrefs:
             # Look for a manifest href differing only in letter case.
             matches = (h for h in known_hrefs if h.lower() == path.lower())
             corrected_href = next(matches, None)
             if corrected_href is None:
                 self.logger.warn('Guide reference %r not found' % ref_href)
                 continue
             # NOTE(review): any fragment on the original href is lost when
             # the corrected manifest href is substituted.
             ref_href = corrected_href
         typ = elem.get('type')
         if typ in guide:
             continue
         guide.add(typ, elem.get('title'), ref_href)

Ejemplo n.º 21

0

Mostrar archivo

Archivo: rasterize.py Proyecto: JimmXinu/calibre

 def dataize_svg(self, item, svg=None):
     '''
     Replace xlink:href targets in ``svg`` that are manifest items with
     paths to temporary files holding the linked data.

     :param item: Manifest item used to resolve relative links.
     :param svg: Tree to process; defaults to ``item.data``.
     :return: The processed tree (modified in place).
     '''
     if svg is None:
         svg = item.data
     hrefs = self.oeb.manifest.hrefs
     for elem in xpath(svg, '//svg:*[@xl:href]'):
         path = urldefrag(urlnormalize(elem.attrib[XLINK('href')]))[0]
         if not path:
             continue
         abshref = item.abshref(path)
         if abshref in hrefs:
             linkee = hrefs[abshref]
             # NOTE(review): str(linkee) yields text; confirm that the
             # temporary file and what() accept it rather than bytes.
             data = str(linkee)
             ext = what(None, data) or 'jpg'
             with PersistentTemporaryFile(suffix='.'+ext) as pt:
                 pt.write(data)
                 # Tracked on self.temp_files alongside the other temp files.
                 self.temp_files.append(pt.name)
             elem.attrib[XLINK('href')] = pt.name
     return svg

Ejemplo n.º 22

0

Mostrar archivo

Archivo: rasterize.py Proyecto: zwlistu/calibre

 def dataize_svg(self, item, svg=None):
     '''
     Point every xlink:href in ``svg`` that refers to a manifest item at
     a temporary file containing that item's data.

     :param item: Manifest item whose location resolves relative links.
     :param svg: Tree to rewrite; ``item.data`` is used when omitted.
     :return: The same tree, after in-place modification.
     '''
     tree = item.data if svg is None else svg
     hrefs = self.oeb.manifest.hrefs
     for elem in xpath(tree, '//svg:*[@xl:href]'):
         href = urlnormalize(elem.attrib[XLINK('href')])
         path, _ = urldefrag(href)
         if not path:
             # Fragment-only reference: nothing external to embed.
             continue
         abshref = item.abshref(path)
         if abshref not in hrefs:
             continue
         # NOTE(review): the linked item is stringified with str(); verify
         # that writing this to the temporary file is valid (text vs bytes).
         data = str(hrefs[abshref])
         ext = what(None, data) or 'jpg'
         with PersistentTemporaryFile(suffix='.' + ext) as pt:
             pt.write(data)
             self.temp_files.append(pt.name)
         elem.attrib[XLINK('href')] = pt.name
     return tree

Ejemplo n.º 23

0

Mostrar archivo

Archivo: reader.py Proyecto: j-howell/calibre

 def _toc_from_tour(self, opf):
     '''
     Build the TOC from the first <tour> in the OPF.

     Returns False when the OPF has no tour, True otherwise. Sites that
     lack a title or href are ignored; sites whose target is not in the
     manifest are logged and skipped.
     '''
     tours = xpath(opf, 'o2:tours/o2:tour')
     if not tours:
         return False
     self.log.debug('Reading TOC from tour...')
     tour = tours[0]
     toc = self.oeb.toc
     toc.title = tour.get('title')
     for site in xpath(tour, 'o2:site'):
         title, href = site.get('title'), site.get('href')
         if not (title and href):
             continue
         path = urldefrag(urlnormalize(href))[0]
         if path in self.oeb.manifest.hrefs:
             toc.add(title, href, id=site.get('id'))
         else:
             self.logger.warn('TOC reference %r not found' % href)
     return True

Ejemplo n.º 24

0

Mostrar archivo

Archivo: reader.py Proyecto: jimman2003/calibre

 def _toc_from_tour(self, opf):
     '''
     Read the TOC from the OPF's first tour element.

     Returns False if there is no tour; otherwise the tour title becomes
     the TOC title, each usable site becomes a TOC entry and True is
     returned. A site is usable when it has both a title and an href and
     its path (without fragment) is in the manifest.
     '''
     result = xpath(opf, 'o2:tours/o2:tour')
     if not result:
         return False
     self.log.debug('Reading TOC from tour...')
     first_tour = result[0]
     toc = self.oeb.toc
     toc.title = first_tour.get('title')
     known_hrefs = self.oeb.manifest.hrefs
     for site in xpath(first_tour, 'o2:site'):
         title = site.get('title')
         href = site.get('href')
         if title and href:
             target, _ = urldefrag(urlnormalize(href))
             if target not in known_hrefs:
                 self.logger.warn('TOC reference %r not found' % href)
                 continue
             site_id = site.get('id')
             toc.add(title, href, id=site_id)
     return True

Ejemplo n.º 25

0

Mostrar archivo

Archivo: serializer.py Proyecto: smdx023/calibre

 def fixup_links(self):
     '''
     Fill in the correct values for all filepos="..." links with the offsets
     of the linked to content (as stored in id_offsets).

     A target missing from id_offsets is logged and retried without its
     fragment; if that is missing too, its placeholders are left untouched.
     The offset of the start href, when resolved, is stored in
     self.start_offset.
     '''
     out = self.buf
     offsets = self.id_offsets
     start_href = getattr(self, '_start_href', None)
     for target, positions in self.href_offsets.items():
         # Decided on the original target, before any fragment is dropped.
         points_to_start = bool(target) and target == start_href
         if target not in offsets:
             self.logger.warn('Hyperlink target %r not found' % target)
             # Link to the top of the document, better than just ignoring
             target = urldefrag(target)[0]
         if target not in offsets:
             continue
         ioff = offsets[target]
         if points_to_start:
             self.start_offset = ioff
         digits = ('%010d' % ioff).encode('utf-8')
         # Overwrite every placeholder recorded for this target.
         for pos in positions:
             out.seek(pos)
             out.write(digits)

Ejemplo n.º 26

0

Mostrar archivo

Archivo: serializer.py Proyecto: JimmXinu/calibre

 def fixup_links(self):
     '''
     Fill in the correct values for all filepos="..." links with the offsets
     of the linked to content (as stored in id_offsets).

     When a link target is unknown a warning is logged and the link is
     pointed at the target without its fragment instead, if that is known.
     self.start_offset is set when the start href resolves to an offset.
     '''
     buf = self.buf
     id_offsets = self.id_offsets
     start_href = getattr(self, '_start_href', None)
     for href, hoffs in self.href_offsets.items():
         is_start = (href and href == start_href)
         # Iterate over all filepos items
         resolved = href
         if resolved not in id_offsets:
             self.logger.warn('Hyperlink target %r not found' % href)
             # Link to the top of the document, better than just ignoring
             resolved, _ = urldefrag(href)
         if resolved in id_offsets:
             ioff = id_offsets[resolved]
             if is_start:
                 self.start_offset = ioff
             # Each placeholder is ten digits wide; overwrite it in place.
             for hoff in hoffs:
                 buf.seek(hoff)
                 buf.write(('%010d' % ioff).encode('utf-8'))

Ejemplo n.º 27

0

Mostrar archivo

Archivo: reader.py Proyecto: jimman2003/calibre

 def _spine_add_extra(self):
     '''
     Add documents that are linked from the spine, but not part of it, to
     the spine as non-linear items.

     Links are followed transitively: documents reachable only through
     other newly found documents are added as well. Items whose href is
     listed in ``self.oeb.removed_items_to_ignore`` are not added.
     '''
     manifest = self.oeb.manifest
     spine = self.oeb.spine
     unchecked = set(spine)
     selector = XPath('h:body//h:a/@href')
     extras = set()
     while unchecked:
         new = set()
         for item in unchecked:
             if item.media_type not in OEB_DOCS:
                 # TODO: handle fallback chains
                 continue
             for href in selector(item.data):
                 href, _ = urldefrag(href)
                 if not href:
                     continue
                 try:
                     href = item.abshref(urlnormalize(href))
                 except ValueError:  # Malformed URL
                     continue
                 if href not in manifest.hrefs:
                     continue
                 found = manifest.hrefs[href]
                 if found.media_type not in OEB_DOCS or \
                    found in spine or found in extras:
                     continue
                 new.add(found)
         extras.update(new)
         unchecked = new
     version = int(self.oeb.version[0])
     removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore',
                                       ())
     # Sets iterate in arbitrary order; sort so that the resulting spine
     # order (and the order of the warnings) is the same on every run.
     for item in sorted(extras):
         if item.href in removed_items_to_ignore:
             continue
         if version >= 2:
             self.logger.warn('Spine-referenced file %r not in spine' %
                              item.href)
         spine.add(item, linear=False)

Ejemplo n.º 28

0

Mostrar archivo

Archivo: reader.py Proyecto: j-howell/calibre

 def _spine_add_extra(self):
     """Append to the spine any document that spine content links to.

     The walk starts from the current spine items.  For each document, the
     ``href`` of every ``<a>`` in its body is defragmented, resolved
     against the document and matched with the manifest.  Freshly reached
     documents form the frontier of the next pass.  Once the frontier is
     empty, the collected documents are added in sorted order as
     non-linear spine items, skipping hrefs named in
     ``self.oeb.removed_items_to_ignore``.
     """
     manifest = self.oeb.manifest
     spine = self.oeb.spine
     find_links = XPath('h:body//h:a/@href')
     frontier = set(spine)
     extras = set()
     while frontier:
         reached = set()
         for source in frontier:
             if source.media_type not in OEB_DOCS:
                 # TODO: handle fallback chains
                 continue
             for raw_href in find_links(source.data):
                 bare = urldefrag(raw_href)[0]
                 if bare:
                     try:
                         resolved = source.abshref(urlnormalize(bare))
                     except ValueError:  # Malformed URL
                         continue
                     if resolved in manifest.hrefs:
                         target = manifest.hrefs[resolved]
                         is_new_doc = (target.media_type in OEB_DOCS
                                       and target not in spine
                                       and target not in extras)
                         if is_new_doc:
                             reached.add(target)
         extras |= reached
         frontier = reached
     version = int(self.oeb.version[0])
     skip_hrefs = getattr(self.oeb, 'removed_items_to_ignore', ())
     for extra in sorted(extras):
         if extra.href not in skip_hrefs:
             if version >= 2:
                 self.logger.warn(
                     'Spine-referenced file %r not in spine' % extra.href)
             spine.add(extra, linear=False)

Ejemplo n.º 29

0

Mostrar archivo

Archivo: reader.py Proyecto: jimman2003/calibre

    def _manifest_add_missing(self, invalid):
        """Add files that manifest items reference but the manifest lacks.

        Each pass scans the data of the unchecked manifest items: links
        from (X)HTML/XML items via ``iterlinks`` and ``url()`` references
        from stylesheets via ``css_parser.getUrls``.  Every scheme-less
        href not yet known is checked against the container; existing
        files are added to the manifest (and scanned on the next pass),
        missing ones are only reported.  Items whose data cannot be read
        are collected and removed from the manifest.

        :param invalid: Ignored; the name is rebound to a fresh set on
            entry.  Kept so existing callers keep working.
        """
        import css_parser
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS | OEB_STYLES
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc
                        or item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except Exception:
                        # Best effort: an unreadable entry is dropped, but
                        # KeyboardInterrupt/SystemExit must still propagate.
                        self.oeb.log.exception('Failed to read from manifest '
                                               'entry with id: %s, ignoring' %
                                               item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS
                        or item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r' % href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except Exception:
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        # Same handling as for HTML links above: a malformed
                        # URL in a stylesheet is logged and skipped instead
                        # of aborting the whole scan.
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r' % href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            # ``new`` is a set, so each href is visited (and warned about)
            # at most once per pass; no extra bookkeeping is needed.
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    self.logger.warn('Referenced file %r not found' % href)
                    continue
                self.logger.warn('Referenced file %r not in manifest' %
                                 href)
                item_id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(item_id, href, media_type)
                unchecked.add(added)

            # NOTE(review): this runs on every pass while ``invalid`` keeps
            # accumulating, so an item may be handed to remove() more than
            # once - confirm that manifest.remove() tolerates that.
            for item in invalid:
                self.oeb.manifest.remove(item)

Ejemplo n.º 30

0

Mostrar archivo

Archivo: reader.py Proyecto: j-howell/calibre

    def _manifest_add_missing(self, invalid):
        """Add files that manifest items reference but the manifest lacks.

        Each pass scans the data of the unchecked manifest items: links
        from (X)HTML/XML items via ``iterlinks`` and ``url()`` references
        from stylesheets via ``css_parser.getUrls``.  Every scheme-less
        href not yet known is checked against the container; existing
        files are added to the manifest (and scanned on the next pass),
        missing ones are only reported.  Items whose data cannot be read
        are collected and removed from the manifest.

        :param invalid: Ignored; the name is rebound to a fresh set on
            entry.  Kept so existing callers keep working.
        """
        import css_parser
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except Exception:
                        # Best effort: an unreadable entry is dropped, but
                        # KeyboardInterrupt/SystemExit must still propagate.
                        self.oeb.log.exception(u'Failed to read from manifest '
                                u'entry with id: %s, ignoring'%item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except Exception:
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        # Same handling as for HTML links above: a malformed
                        # URL in a stylesheet is logged and skipped instead
                        # of aborting the whole scan.
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            # ``new`` is a set, so each href is visited (and warned about)
            # at most once per pass; no extra bookkeeping is needed.
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    self.logger.warn('Referenced file %r not found' % href)
                    continue
                self.logger.warn('Referenced file %r not in manifest' % href)
                item_id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(item_id, href, media_type)
                unchecked.add(added)

            # NOTE(review): this runs on every pass while ``invalid`` keeps
            # accumulating, so an item may be handed to remove() more than
            # once - confirm that manifest.remove() tolerates that.
            for item in invalid:
                self.oeb.manifest.remove(item)

Ejemplo n.º 31

0

Mostrar archivo

Archivo: serializer.py Proyecto: smdx023/calibre

 def spine_item(tocitem):
     """Return the spine item a TOC entry points at, or None.

     The fragment part of ``tocitem.href`` is dropped before comparing
     against the href of each spine item; the first match wins.
     """
     target = urldefrag(tocitem.href)[0]
     return next(
         (entry for entry in self.oeb.spine if entry.href == target), None)

Ejemplo n.º 32

0

Mostrar archivo

Archivo: mobi8.py Proyecto: smdx023/calibre

    def read_inline_toc(self, href, frag):
        """Build a TOC from the links of an inline (HTML) table of contents.

        The file named by *href* is read and parsed.  Links are collected
        from the point where the start element has been passed: the element
        whose ``id`` equals *frag* when given and found, otherwise the
        ``<body>``; if there is no body, collection is active from the
        first element.  Duplicate (text, href, fragment) triples are
        dropped.  Entries are nested according to how deep their ``<a>``
        element sits in the source tree.

        :param href: '/'-separated path of the HTML file holding the TOC.
        :param frag: Optional id of the element the TOC starts at.
        :return: The populated ``TOC`` object.
        """
        toc = TOC()
        base_href = href.rpartition('/')[0]
        with open(href.replace('/', os.sep), 'rb') as stream:
            markup = stream.read().decode(self.header.codec)
        root = parse_html(markup, log=self.log)
        bodies = XPath('//h:body')(root)
        # Without a body there is nothing to wait for: collect immediately
        start = bodies[0] if bodies else None
        reached = not bodies
        if frag:
            anchors = XPath('//*[@id="%s"]' % frag)(root)
            if anchors:
                start = anchors[0]

        def node_depth(elem):
            # Number of ancestors between elem and the root of the tree
            depth = 0
            ancestor = elem.getparent()
            while ancestor is not None:
                depth += 1
                ancestor = ancestor.getparent()
            return depth

        # Layer the ToC based on nesting order in the source HTML
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            if not reached:
                if elem is start:
                    reached = True
                continue
            if elem.tag != XHTML('a') or not elem.get('href', False):
                continue
            target, fragment = urldefrag(elem.get('href'))
            target = base_href + '/' + target
            text = xml2text(elem).strip()
            key = (text, target, fragment)
            if key not in seen:
                seen.add(key)
                links.append((text, target, fragment, node_depth(elem)))

        # Map the raw tree depths onto consecutive levels 0, 1, 2, ...
        levels = {
            raw: level
            for level, raw in enumerate(sorted({link[-1] for link in links}))
        }
        current_depth = None
        parent = toc
        for text, target, fragment, raw_depth in links:
            level = levels[raw_depth]
            if current_depth is None:
                current_depth = 0
            elif current_depth < level:
                # Descend one step: nest under the most recent entry, if any
                if len(parent) > 0:
                    parent = parent[-1]
                current_depth += 1
            elif current_depth > level:
                # Climb back up, stopping early at the top of the TOC
                for _ in range(current_depth - level):
                    if parent.parent is None:
                        break
                    parent = parent.parent
                current_depth = level
            parent.add_item(target, fragment, text)
        return toc

Ejemplo n.º 33

0

Mostrar archivo

 def tree_to_binary(self,
                    elem,
                    nsrmap=NSRMAP,
                    parents=None,
                    inhead=False,
                    preserve=False):
     """Serialise *elem* and, recursively, its children through ``self.write``.

     For the element this emits: a flags record, the tag (by index when
     it is in ``self.tags``, otherwise as a custom name), every
     attribute, the element text, the children, a closing record (unless
     the element was already written as self-closing) and the tail text.
     Along the way ``id``/``name`` values are recorded in
     ``self.anchors`` and page-break positions in ``self.page_breaks``.

     :param elem: Element to serialise; nodes whose tag is not a string
         (comments, entities) are skipped.
     :param nsrmap: Reverse namespace map (namespace -> prefix, as found
         in ``elem.nsmap``).  It is copied before being extended.
     :param parents: Offsets of the enclosing tags, outermost first.
         Defaults to a new empty list.
     :param inhead: True while inside the ``head`` element.
     :param preserve: True when whitespace must be written unchanged.
     """
     if not isinstance(elem.tag, string_or_bytes):
         # Don't emit any comments or raw entities
         return
     if parents is None:
         # A new list per top-level call: a shared default list would keep
         # stale offsets forever if an exception interrupted the recursion
         # between the append() and pop() below.
         parents = []
     nsrmap = copy.copy(nsrmap)
     attrib = dict(elem.attrib)
     style = self.stylizer.style(elem) if self.stylizer else None
     # Declare every namespace this element introduces or re-binds
     for key, value in elem.nsmap.items():
         if value not in nsrmap or nsrmap[value] != key:
             xmlns = ('xmlns:' + key) if key else 'xmlns'
             attrib[xmlns] = value
         nsrmap[value] = key
     tag = prefixname(elem.tag, nsrmap)
     tag_offset = self.buf.tell()
     if tag == 'head':
         inhead = True
     flags = FLAG_OPENING
     if not elem.text and len(elem) == 0:
         # No text and no children: open and close in a single record
         flags |= FLAG_CLOSING
     if inhead:
         flags |= FLAG_HEAD
     if style and self.is_block(style):
         flags |= FLAG_BLOCK
     self.write(0, flags)
     tattrs = self.tattrs[0]
     if tag in self.tags:
         index = self.tags[tag]
         self.write(index)
         if self.tattrs[index]:
             tattrs = self.tattrs[index]
     else:
         self.write(FLAG_CUSTOM, len(tag) + 1, tag)
     last_break = self.page_breaks[-1][0] if self.page_breaks else None
     if style and last_break != tag_offset \
        and style['page-break-before'] in PAGE_BREAKS:
         self.page_breaks.append((tag_offset, list(parents)))
     for attr, value in attrib.items():
         attr = prefixname(attr, nsrmap)
         if attr in ('href', 'src'):
             value = urlnormalize(value)
             path, frag = urldefrag(value)
             if self.item:
                 path = self.item.abshref(path)
             # The value is prefixed with chr(2) when the target is a
             # manifest item (then referenced by its id), else with chr(3)
             prefix = codepoint_to_chr(3)
             if path in self.manifest.hrefs:
                 prefix = codepoint_to_chr(2)
                 value = self.manifest.hrefs[path].id
                 if frag:
                     value = '#'.join((value, frag))
             value = prefix + value
         elif attr in ('id', 'name'):
             self.anchors.append((value, tag_offset))
         elif attr.startswith('ms--'):
             attr = '%' + attr[4:]
         elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
             value = CSS_MIME
         if attr in tattrs:
             self.write(tattrs[attr])
         else:
             self.write(FLAG_CUSTOM, len(attr) + 1, attr)
         # Integer-valued attributes are written as numbers, others as text
         try:
             self.write(ATTR_NUMBER, int(value) + 1)
         except ValueError:
             self.write(len(value) + 1, value)
     self.write(0)
     old_preserve = preserve
     if style:
         preserve = (style['white-space'] in ('pre', 'pre-wrap'))
     xml_space = elem.get(XML('space'))
     if xml_space == 'preserve':
         preserve = True
     elif xml_space == 'normal':
         preserve = False
     if elem.text:
         if preserve:
             self.write(elem.text)
         elif len(elem) == 0 or not elem.text.isspace():
             self.write(COLLAPSE.sub(' ', elem.text))
         # else: nothing to emit
     parents.append(tag_offset)
     child = cstyle = nstyle = None
     # Walk the children one step behind, so each child is processed with
     # knowledge of the style of the sibling that follows it
     for upcoming in chain(elem, [None]):
         if self.stylizer:
             nstyle = None if upcoming is None \
                 else self.stylizer.style(upcoming)
         if child is not None:
             if not preserve \
                and (inhead or not nstyle or self.is_block(cstyle) or self.is_block(nstyle)) \
                and child.tail and child.tail.isspace():
                 child.tail = None
             self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
         child, cstyle = upcoming, nstyle
     parents.pop()
     preserve = old_preserve
     if not flags & FLAG_CLOSING:
         self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
     if elem.tail and tag != 'html':
         tail = elem.tail
         if not preserve:
             tail = COLLAPSE.sub(' ', tail)
         self.write(tail)
     if style and style['page-break-after'] not in ('avoid', 'auto'):
         self.page_breaks.append((self.buf.tell(), list(parents)))

Ejemplo n.º 34

0

Mostrar archivo

Archivo: reader.py Proyecto: jimman2003/calibre

    def binary_to_text_inner(self, bin, buf, stack):
        """Run one frame of the binary-to-markup state machine.

        Pops the top frame off *stack*, restores the parser state stored in
        it and consumes characters of *bin* starting at ``self.cpos``,
        writing the resulting markup to *buf*.  The loop stops at the end
        of *bin*, when a closing record is met, or after an opening tag
        that has content; in the last case two frames are pushed first (one
        to close the tag later, one for its content).

        :param bin: Encoded input, decoded one character at a time with
            ``read_utf8_char``.
        :param buf: Output stream; receives bytes.
        :param stack: List of 9-tuples ``(depth, tag_name, current_map,
            dynamic_tag, errors, in_censorship, is_goingdown, state,
            flags)``.
        :raises LitError: On malformed input (unknown atom or attribute,
            invalid length, unbalanced closing tag).
        """
        (depth, tag_name, current_map, dynamic_tag, errors, in_censorship,
         is_goingdown, state, flags) = stack.pop()

        # Resuming a frame that was pushed when its tag was opened: emit the
        # end tag, then carry on reading text at this level
        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            # Character data; a NUL character introduces a tag record
            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            # Flags of the tag record; NUL means there was no tag after all
            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            # Tag identifier: 0x8000 announces a custom (spelled out) name,
            # otherwise the name comes from the atom table or the tag map
            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write(b'<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError("atom tag %d not in atom tag list" %
                                           tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?' + codepoint_to_chr(tag) + '?'
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' %
                              codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    # Closing record: this frame is finished
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d' %
                                       (tag_name, self.cpos))
                    break

            # Attribute identifier; NUL ends the attribute list
            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(b' />')
                    else:
                        buf.write(b'>')
                        # First the frame that will write the end tag, then
                        # (on top of it) the frame for the tag's content
                        frame = (depth, tag_name, current_map, dynamic_tag,
                                 errors, in_censorship, False, 'close tag',
                                 flags)
                        stack.append(frame)
                        frame = (depth + 1, None, None, 0, 0, False, False,
                                 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError('Unknown attribute %d in tag %s' %
                                       (oc, tag_name))
                    # Attributes whose name starts with '%' are consumed
                    # but never written to the output
                    if attr.startswith('%'):
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(b' ' + encode(attr) + b'=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            # Length prefix of an attribute value (stored as length + 1)
            elif state == 'get value length':
                if not in_censorship:
                    buf.write(b'"')
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                # 0xffff skips the range check; the resulting count of
                # 0xfffe makes 'get value' read a single numeric value
                if oc == 0xffff:
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            # Attribute value: either one number (written as oc - 1) or
            # `count` characters, with '"' and '<' escaped
            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write(encode('%s"' % (oc - 1)))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        if isinstance(c, unicode_type):
                            c = c.encode('ascii', 'xmlcharrefreplace')
                        buf.write(c)
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

            # Length prefix of a custom tag name
            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin) - self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            # Characters of a custom tag name
            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            # Length prefix of a custom attribute name
            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(b' ')
                state = 'get custom attr'

            # Characters of a custom attribute name, written as they arrive
            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write(b'=')
                    state = 'get value length'

            # Length prefix of an href/src value
            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            # href/src value: once complete, the first character is dropped
            # (presumably a type marker - TODO confirm), the document part
            # is mapped through self.item_path and the fragment re-attached
            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode('"%s"' % path))
                    state = 'get attr'

Ejemplo n.º 35

0

Mostrar archivo

Archivo: reader.py Proyecto: JimmXinu/calibre

    def binary_to_text_inner(self, bin, buf, stack):
        """Run one frame of the binary-to-markup state machine.

        Pops the top frame off *stack*, restores the parser state stored in
        it and consumes characters of *bin* starting at ``self.cpos``,
        writing the resulting markup to *buf*.  The loop stops at the end
        of *bin*, when a closing record is met, or after an opening tag
        that has content; in the last case two frames are pushed first (one
        to close the tag later, one for its content).

        :param bin: Encoded input, decoded one character at a time with
            ``read_utf8_char``.
        :param buf: Output stream; receives bytes.
        :param stack: List of 9-tuples ``(depth, tag_name, current_map,
            dynamic_tag, errors, in_censorship, is_goingdown, state,
            flags)``.
        :raises LitError: On malformed input (unknown atom or attribute,
            invalid length, unbalanced closing tag).
        """
        (depth, tag_name, current_map, dynamic_tag, errors,
                in_censorship, is_goingdown, state, flags) = stack.pop()

        # Resuming a frame that was pushed when its tag was opened: emit the
        # end tag, then carry on reading text at this level
        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(u''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            # Character data; a NUL character introduces a tag record
            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            # Flags of the tag record; NUL means there was no tag after all
            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            # Tag identifier: 0x8000 announces a custom (spelled out) name,
            # otherwise the name comes from the atom table or the tag map
            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write(b'<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError(
                                "atom tag %d not in atom tag list" % tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?'+codepoint_to_chr(tag)+'?'
                        current_map = self.tag_to_attr_map[tag]
                        print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    # Closing record: this frame is finished
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d'%(tag_name,
                            self.cpos))
                    break

            # Attribute identifier; NUL ends the attribute list
            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(b' />')
                    else:
                        buf.write(b'>')
                        # First the frame that will write the end tag, then
                        # (on top of it) the frame for the tag's content
                        frame = (depth, tag_name, current_map,
                            dynamic_tag, errors, in_censorship, False,
                            'close tag', flags)
                        stack.append(frame)
                        frame = (depth+1, None, None, 0, 0,
                                False, False, 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, string_or_bytes):
                        raise LitError(
                            'Unknown attribute %d in tag %s' % (oc, tag_name))
                    # Attributes whose name starts with '%' are consumed
                    # but never written to the output
                    if attr.startswith('%'):
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(b' ' + encode(attr) + b'=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            # Length prefix of an attribute value (stored as length + 1)
            elif state == 'get value length':
                if not in_censorship:
                    buf.write(b'"')
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                # 0xffff skips the range check; the resulting count of
                # 0xfffe makes 'get value' read a single numeric value
                if oc == 0xffff:
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            # Attribute value: either one number (written as oc - 1) or
            # `count` characters, with '"' and '<' escaped
            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write(encode('%s"' % (oc - 1)))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        if isinstance(c, unicode_type):
                            c = c.encode('ascii', 'xmlcharrefreplace')
                        buf.write(c)
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

            # Length prefix of a custom tag name
            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin)-self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            # Characters of a custom tag name
            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            # Length prefix of a custom attribute name
            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(b' ')
                state = 'get custom attr'

            # Characters of a custom attribute name, written as they arrive
            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write(b'=')
                    state = 'get value length'

            # Length prefix of an href/src value
            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            # href/src value: once complete, the first character is dropped
            # (presumably a type marker - TODO confirm), the document part
            # is mapped through self.item_path and the fragment re-attached
            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode(u'"%s"' % path))
                    state = 'get attr'

Ejemplo n.º 36

0

Mostrar archivo

Archivo: serializer.py Proyecto: JimmXinu/calibre

 def spine_item(tocitem):
     """Look up the spine item referenced by a TOC entry.

     Compares the href of *tocitem*, without its fragment, to the href of
     every spine item and returns the first one that is equal.  Returns
     None when no spine item matches.
     """
     wanted, _ = urldefrag(tocitem.href)
     matches = (entry for entry in self.oeb.spine if entry.href == wanted)
     return next(matches, None)