Python xml2text Examples, calibre.ebooks.oeb.base.xml2text Python Examples

Example #1

0

Show file

File: toc.py Project: JapaChin/calibre

def commit_toc(container, toc, lang=None, uid=None):
    """Build an NCX tree from ``toc`` and store it in ``container``.

    The name returned by ``find_existing_toc`` is reused when there is one;
    otherwise a new ``toc.ncx`` item is generated. A falsy ``lang`` is
    replaced by ``get_lang()`` or by the first ``dc:language`` element of the
    OPF that canonicalizes to a non-empty value. A falsy ``uid`` is replaced
    by ``uuid_id()`` or by the text of the element whose id matches the OPF
    ``unique-identifier`` attribute. The title is the stripped text of the
    first ``dc:title`` element, or ``_("Table of Contents")`` when that is
    missing or empty.
    """
    tocname = find_existing_toc(container)
    if tocname is None:
        new_item = container.generate_item("toc.ncx", id_prefix="toc")
        tocname = container.href_to_name(new_item.get("href"), base=container.opf_name)

    if not lang:
        lang = get_lang()
        for lang_elem in container.opf_xpath("//dc:language"):
            declared = canonicalize_lang(xml2text(lang_elem).strip())
            if declared:
                lang = lang_as_iso639_1(declared) or declared
                break
    lang = lang_as_iso639_1(lang) or lang

    if not uid:
        uid = uuid_id()
        ident_ref = container.opf.get("unique-identifier", None)
        if ident_ref:
            matches = container.opf_xpath('//*[@id="%s"]' % ident_ref)
            if matches:
                uid = xml2text(matches[0])

    title = _("Table of Contents")
    title_elems = container.opf_xpath("//dc:title")
    if title_elems:
        title = xml2text(title_elems[0]).strip() or title

    to_href = partial(container.name_to_href, base=tocname)
    ncx_root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, ncx_root)
    container.pretty_print.add(tocname)

Example #2

0

Show file

def commit_ncx_toc(container, toc, lang=None, uid=None):
    """Build an NCX tree from ``toc`` and store it in ``container``.

    The name returned by ``find_existing_ncx_toc`` is reused when there is
    one; otherwise a new ``toc.ncx`` item is generated and every ``opf:spine``
    element gets its ``toc`` attribute set to the new item's id. A falsy
    ``lang`` is replaced by ``get_lang()`` or by the first ``dc:language``
    element that canonicalizes to a non-empty value. A falsy ``uid`` is
    replaced by ``uuid_id()`` or by the text of the element whose id matches
    the OPF ``unique-identifier`` attribute. The title is the stripped text
    of the first ``dc:title`` element, or ``_('Table of Contents')`` when
    that is missing or empty.
    """
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        ncx_id = item.get('id')
        # A plain loop: this runs only for its side effect, so no throwaway
        # list of ``None`` values is built.
        for spine in container.opf_xpath('//opf:spine'):
            spine.set('toc', ncx_id)

    if not lang:
        lang = get_lang()
        for lang_elem in container.opf_xpath('//dc:language'):
            declared = canonicalize_lang(xml2text(lang_elem).strip())
            if declared:
                lang = lang_as_iso639_1(declared) or declared
                break
    lang = lang_as_iso639_1(lang) or lang

    if not uid:
        uid = uuid_id()
        eid = container.opf.get('unique-identifier', None)
        if eid:
            matches = container.opf_xpath('//*[@id="%s"]'%eid)
            if matches:
                uid = xml2text(matches[0])

    title = _('Table of Contents')
    title_elems = container.opf_xpath('//dc:title')
    if title_elems:
        title = xml2text(title_elems[0]).strip() or title

    to_href = partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)

Example #3

0

Show file

File: structure.py Project: yeyanchao/calibre

    def detect_chapters(self):
        """Record chapter elements from the spine and insert chapter marks.

        Every match of the ``self.opts.chapter`` XPath expression in each
        spine item is appended to ``self.detected_chapters`` as an
        ``(item, element)`` pair. Unless ``self.opts.chapter_mark`` is
        ``'none'``, a marker element is then inserted before each match:
        an ``hr`` for ``'rule'``, a ``div`` with a page-break-after style for
        ``'pagebreak'``, and an ``hr`` with a page-break-before style for any
        other value.
        """
        self.detected_chapters = []

        def find_matches(expr, doc):
            try:
                return XPath(expr)(doc)
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
                return []

        if not self.opts.chapter:
            return

        for item in self.oeb.spine:
            for x in find_matches(self.opts.chapter, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        for item, elem in self.detected_chapters:
            # Collapse whitespace runs so the log line stays on one line.
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            elif chapter_mark == 'rule':
                mark = etree.Element(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                mark = etree.Element(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = etree.Element(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

Example #4

0

Show file

File: jacket.py Project: JimmXinu/calibre

 def remove_first_image(self):
     """Remove images from the first eligible spine item.

     Spine items matching ``JACKET_XPATH`` are skipped. ``self.remove_images``
     is called on the remaining items in order until one call returns a
     positive count; a warning is logged if none does. If that item's body
     then has no text and the item has no ``img``/``svg`` elements left, the
     item is removed from the manifest, and the ToC entries and guide
     references pointing at it are removed as well.
     """
     emptied = None
     for spine_item in self.oeb.spine:
         if XPath(JACKET_XPATH)(spine_item.data):
             continue
         if not (self.remove_images(spine_item) > 0):
             continue
         self.log('Removed first image')
         bodies = XPath('//h:body')(spine_item.data)
         if bodies:
             remaining_text = xml2text(bodies[0]).strip()
             remaining_images = XPath('//h:img|//svg:svg')(spine_item.data)
             if not (remaining_text or remaining_images):
                 self.log('Removing %s as it has no content'%spine_item.href)
                 self.oeb.manifest.remove(spine_item)
                 emptied = spine_item
         # Only the first item that actually lost an image is processed.
         break
     else:
         self.log.warn('Could not find first image to remove')

     if emptied is None:
         return
     # Iterate over a copy because entries are removed while looping.
     for entry in list(self.oeb.toc):
         if urldefrag(entry.href)[0] == emptied.href:
             self.oeb.toc.remove(entry)
     self.oeb.guide.remove_by_href(emptied.href)

Example #5

0

Show file

File: structure.py Project: JimmXinu/calibre

 def create_toc_from_links(self):
     """Add a ToC entry for each suitable link found in the spine.

     Only ``a[@href]`` elements whose URL has no scheme or the ``file``
     scheme are considered. A link is skipped when its URL cannot be parsed,
     when the ToC already has its href, or when duplicate link text is
     disallowed by ``self.opts.duplicate_links_in_toc`` and the ToC already
     has that text. The entry text is the first 100 characters of the link
     text, stripped. Processing stops once ``self.opts.max_toc_links``
     entries were added, when that option is positive.
     """
     added = 0
     for item in self.oeb.spine:
         for anchor in XPath('//h:a[@href]')(item.data):
             target = anchor.get('href')
             try:
                 parts = urlparse(target)
             except ValueError:
                 self.log.warning('Ignoring malformed URL:', target)
                 continue
             if parts[0] and parts[0] != 'file':
                 continue
             path, frag = parts.path, parts.fragment
             target = item.abshref(path)
             if frag:
                 target = '#'.join((target, frag))
             if self.oeb.toc.has_href(target):
                 continue
             label = xml2text(anchor)[:100].strip()
             if (not self.opts.duplicate_links_in_toc and
                     self.oeb.toc.has_text(label)):
                 continue
             try:
                 self.oeb.toc.add(label, target,
                     play_order=self.oeb.toc.next_play_order())
             except ValueError:
                 self.oeb.log.exception('Failed to process link: %r' % target)
                 continue  # Most likely an incorrectly URL encoded link
             else:
                 added += 1
             if self.opts.max_toc_links > 0 and \
                     added >= self.opts.max_toc_links:
                 self.log('Maximum TOC links reached, stopping.')
                 return

Example #6

0

Show file

File: reader.py Project: mihailim/calibre

 def _toc_from_html(self, opf):
     """Populate the ToC from links in the guide's ``toc`` HTML document.

     Returns ``False`` when the guide has no ``toc`` entry, ``True``
     otherwise. If the guide href has a fragment, the search starts at the
     element with that ``id`` (or ``name``) and moves up through its parents
     until a subtree containing links is found. Titles of links that share
     an href are joined with spaces, and entries are added in order of first
     appearance. Links whose path is not in the manifest are ignored.
     ``opf`` is not used by this method.
     """
     if 'toc' not in self.oeb.guide:
         return False
     self.log.debug('Reading TOC from HTML...')
     itempath, frag = urldefrag(self.oeb.guide['toc'].href)
     item = self.oeb.manifest.hrefs[itempath]
     scope = item.data
     if frag:
         hits = (xpath(scope, './/*[@id="%s"]' % frag) or
                 xpath(scope, './/*[@name="%s"]' % frag))
         node = hits[0] if hits else scope
         # Widen the scope until it contains at least one link.
         while node != scope and not xpath(node, './/h:a[@href]'):
             node = node.getparent()
         scope = node

     titles = defaultdict(list)
     order = []
     for anchor in xpath(scope, './/h:a[@href]'):
         href = item.abshref(urlnormalize(anchor.attrib['href']))
         path = urldefrag(href)[0]
         if path not in self.oeb.manifest.hrefs:
             continue
         title = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
         if href not in titles:
             order.append(href)
         titles[href].append(title)

     toc = self.oeb.toc
     for href in order:
         toc.add(' '.join(titles[href]), href)
     return True

Example #7

0

Show file

File: structure.py Project: Eksmo/calibre

    def detect_chapters(self):
        """Record chapter elements from the spine and insert chapter marks.

        Every match of the ``self.opts.chapter`` XPath expression in each
        spine item is stored in ``self.detected_chapters`` as an
        ``(item, element)`` pair. Unless ``self.opts.chapter_mark`` is
        ``'none'``, a marker element is inserted before each match.
        """
        self.detected_chapters = []
        if not self.opts.chapter:
            return

        find_chapters = XPath(self.opts.chapter)
        for item in self.oeb.spine:
            self.detected_chapters.extend(
                (item, match) for match in find_chapters(item.data))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        for item, elem in self.detected_chapters:
            label = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', label[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = etree.Element(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                mark = etree.Element(XHTML('div'), style=page_break_after)
            else:
                # Any other value is treated like 'both'.
                mark = etree.Element(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

Example #8

0

Show file

File: report.py Project: Riva3000/calibre

def description_for_anchor(elem):
    """Return a short description (at most 30 characters) for ``elem``.

    Candidates are tried in order: the ``title`` attribute, the element's
    own text, then the first child's text. A candidate is used when its
    stripped form has at least four characters. If none qualifies and the
    element has no more than six descendant elements, the stripped full text
    from ``xml2text`` is used when it is at least one character long.
    Returns ``None`` when nothing qualifies.
    """
    def check(x, min_len=4):
        if not x:
            return None
        stripped = x.strip()
        return stripped[:30] if len(stripped) >= min_len else None

    def candidates():
        # Lazy, so later lookups only happen when earlier ones fail.
        yield elem.get('title')
        yield elem.text
        if len(elem) > 0:
            yield elem[0].text

    for raw in candidates():
        desc = check(raw)
        if desc is not None:
            return desc

    # Get full text for tags that have only a few descendants
    has_many_descendants = False
    for idx, _descendant in enumerate(elem.iterdescendants('*')):
        if idx > 5:
            has_many_descendants = True
            break
    if has_many_descendants:
        return None
    return check(xml2text(elem), min_len=1)

Example #9

0

Show file

def elem_to_toc_text(elem):
    """Return ToC entry text for ``elem``.

    The element's stripped text is preferred, then its ``title`` attribute,
    then its ``alt`` attribute. Whitespace runs are collapsed to single
    spaces and the result is cut to 1000 characters. ``_('(Untitled)')`` is
    returned when nothing is left.
    """
    label = xml2text(elem).strip() or elem.get('title', '') or elem.get('alt', '')
    label = re.sub(r'\s+', ' ', label.strip())[:1000].strip()
    return label or _('(Untitled)')

Example #10

0

Show file

File: toc.py Project: JapaChin/calibre

def elem_to_toc_text(elem):
    """Return ToC entry text for ``elem``.

    The element's stripped text is preferred, then its ``title`` attribute,
    then its ``alt`` attribute. Whitespace runs are collapsed to single
    spaces and the result is cut to 1000 characters. ``_("(Untitled)")`` is
    returned when nothing is left.
    """
    label = xml2text(elem).strip() or elem.get("title", "") or elem.get("alt", "")
    label = re.sub(r"\s+", " ", label.strip())[:1000].strip()
    return label or _("(Untitled)")

Example #11

0

Show file

File: structure.py Project: Eksmo/calibre

 def elem_to_link(self, item, elem, counter):
     """Return ``(text, href)`` for a ToC link to ``elem`` inside ``item``.

     The text is the element's stripped text, else its ``title`` attribute,
     else its ``alt`` attribute, with whitespace collapsed and cut to 1000
     characters. The element keeps its existing ``id`` or is given
     ``calibre_toc_<counter>``; the href is ``item.href`` joined to that id
     with ``#``.
     """
     label = xml2text(elem).strip() or elem.get('title', '') or elem.get('alt', '')
     label = re.sub(r'\s+', ' ', label.strip())[:1000].strip()
     anchor_id = elem.get('id', 'calibre_toc_%d'%counter)
     # Written back so the link target exists even if the id was generated.
     elem.set('id', anchor_id)
     return label, '#'.join((item.href, anchor_id))

Example #12

0

Show file

File: structure.py Project: pra85/calibre

 def elem_to_link(self, item, elem, counter):
     """Return ``(text, href)`` for a ToC link to ``elem`` inside ``item``.

     The text is the element's stripped text, else its ``title`` attribute,
     else its ``alt`` attribute, with whitespace collapsed and cut to 1000
     characters. The element keeps its existing ``id`` or is given
     ``calibre_toc_<counter>``; the href is ``item.href`` joined to that id
     with ``#``.
     """
     label = xml2text(elem).strip() or elem.get("title", "") or elem.get("alt", "")
     label = re.sub(r"\s+", " ", label.strip())[:1000].strip()
     anchor_id = elem.get("id", "calibre_toc_%d" % counter)
     # Written back so the link target exists even if the id was generated.
     elem.set("id", anchor_id)
     return label, "#".join((item.href, anchor_id))

Example #13

0

Show file

File: structure.py Project: JimmXinu/calibre

    def detect_chapters(self):
        """Record chapter elements from the spine and insert chapter marks.

        ``self.opts.chapter`` is split by ``self.get_toc_parts_for_xpath``
        into an XPath and a title attribute; the latter is stored on
        ``self.chapter_title_attribute``. Every match of the XPath in each
        spine item is appended to ``self.detected_chapters`` as an
        ``(item, element)`` pair. Unless ``self.opts.chapter_mark`` is
        ``'none'``, a marker element is inserted before each match.
        """
        self.detected_chapters = []
        self.chapter_title_attribute = None

        def find_matches(expr, doc):
            try:
                ans = XPath(expr)(doc)
                # len() fails for a result that has no length, so such a
                # result is reported as an invalid expression too.
                len(ans)
                return ans
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
                return []

        if not self.opts.chapter:
            return

        chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in find_matches(chapter_path, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        c = Counter()  # number of chapters seen so far, per spine item
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if they
                    # are at the start of the file, in which case inserting a
                    # page break is unnecessary and can lead to extra blank
                    # pages in the PDF Output plugin. We need to use two as
                    # feedbooks epubs match both a heading tag and its
                    # containing div with the default chapter expression.
                    continue
                mark = elem.makeelement(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = elem.makeelement(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

Example #14

0

Show file

    def detect_chapters(self):
        """Record chapter elements from the spine and insert chapter marks.

        ``self.opts.chapter`` is split by ``self.get_toc_parts_for_xpath``
        into an XPath and a title attribute; the latter is stored on
        ``self.chapter_title_attribute``. Every match of the XPath in each
        spine item is appended to ``self.detected_chapters`` as an
        ``(item, element)`` pair. Unless ``self.opts.chapter_mark`` is
        ``'none'``, a marker element is inserted before each match.
        """
        self.detected_chapters = []
        self.chapter_title_attribute = None

        def find_matches(expr, doc):
            try:
                ans = XPath(expr)(doc)
                # len() fails for a result that has no length, so such a
                # result is reported as an invalid expression too.
                len(ans)
                return ans
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
                return []

        if not self.opts.chapter:
            return

        chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in find_matches(chapter_path, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        c = Counter()  # number of chapters seen so far, per spine item
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if they
                    # are at the start of the file, in which case inserting a
                    # page break is unnecessary and can lead to extra blank
                    # pages in the PDF Output plugin. We need to use two as
                    # feedbooks epubs match both a heading tag and its
                    # containing div with the default chapter expression.
                    continue
                mark = elem.makeelement(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = elem.makeelement(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

Example #15

0

Show file

File: toc.py Project: tokot/calibre

def find_text(node):
    """Return the text of the first child element of ``node`` that has any.

    Whitespace runs in a child's text are collapsed to single spaces. When
    that text is longer than 200 characters, the child's own children are
    searched recursively for shorter text; failing that, the first 200
    characters followed by ``'...'`` are returned. Returns ``None`` when no
    child element has text.
    """
    LIMIT = 200
    whitespace = re.compile(r'\s+')
    for child in node:
        if not isinstance(child, etree._Element):
            continue
        text = whitespace.sub(' ', xml2text(child).strip())
        if len(text) < 1:
            continue
        if len(text) <= LIMIT:
            return text
        # Look for less text in a child of this node, recursively
        return find_text(child) or (text[:LIMIT] + '...')
    return None

Example #16

0

Show file

def find_text(node):
    """Return the text of the first child element of ``node`` that has any.

    Whitespace runs in a child's text are collapsed to single spaces. When
    that text is longer than 200 characters, the child's own children are
    searched recursively for shorter text; failing that, the first 200
    characters followed by ``'...'`` are returned. Returns ``None`` when no
    child element has text.
    """
    LIMIT = 200
    whitespace = re.compile(r'\s+')
    for child in node:
        if not isinstance(child, etree._Element):
            continue
        text = whitespace.sub(' ', xml2text(child).strip())
        if len(text) < 1:
            continue
        if len(text) <= LIMIT:
            return text
        # Look for less text in a child of this node, recursively
        return find_text(child) or (text[:LIMIT] + '...')
    return None

Example #17

0

Show file

File: structure.py Project: WilliamRJohns/glacier.io

 def elem_to_link(self, item, elem, title_attribute, counter):
     """Return ``(text, href)`` for a ToC link to ``elem`` inside ``item``.

     When ``title_attribute`` is not ``None``, that attribute of the element
     is the preferred text. The fallbacks are the element's stripped text,
     then its ``title`` attribute, then its ``alt`` attribute. Whitespace is
     collapsed and the text is cut to 1000 characters. The element keeps its
     existing ``id`` or is given ``calibre_toc_<counter>``; the href is
     ``item.href`` joined to that id with ``#``.
     """
     label = elem.get(title_attribute, '') if title_attribute is not None else ''
     label = (label or xml2text(elem).strip() or
              elem.get('title', '') or elem.get('alt', ''))
     label = re.sub(r'\s+', ' ', label.strip())[:1000].strip()
     anchor_id = elem.get('id', 'calibre_toc_%d' % counter)
     # Written back so the link target exists even if the id was generated.
     elem.set('id', anchor_id)
     return label, '#'.join((item.href, anchor_id))

Example #18

0

Show file

File: structure.py Project: pra85/calibre

    def detect_chapters(self):
        """Record chapter elements from the spine and insert chapter marks.

        Every match of the ``self.opts.chapter`` XPath expression in each
        spine item is appended to ``self.detected_chapters`` as an
        ``(item, element)`` pair. Unless ``self.opts.chapter_mark`` is
        ``"none"``, a marker element is inserted before each match.
        """
        self.detected_chapters = []

        def find_matches(expr, doc):
            try:
                ans = XPath(expr)(doc)
                # len() fails for a result that has no length, so such a
                # result is reported as an invalid expression too.
                len(ans)
                return ans
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                self.log.warn("Invalid chapter expression, ignoring: %s" % expr)
                return []

        if not self.opts.chapter:
            return

        for item in self.oeb.spine:
            for x in find_matches(self.opts.chapter, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = "display: block; page-break-before: always"
        page_break_after = "display: block; page-break-after: always"
        c = Counter()  # number of chapters seen so far, per spine item
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = re.sub(r"\s+", " ", xml2text(elem).strip())
            self.log("\tDetected chapter:", text[:50])
            if chapter_mark == "none":
                continue
            if chapter_mark == "rule":
                mark = etree.Element(XHTML("hr"))
            elif chapter_mark == "pagebreak":
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if they
                    # are at the start of the file, in which case inserting a
                    # page break is unnecessary and can lead to extra blank
                    # pages in the PDF Output plugin. We need to use two as
                    # feedbooks epubs match both a heading tag and its
                    # containing div with the default chapter expression.
                    continue
                mark = etree.Element(XHTML("div"), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = etree.Element(XHTML("hr"), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception("Failed to mark chapter")

Example #19

0

Show file

File: cover.py Project: siebert/calibre

def find_cover_image_in_page(container, cover_page):
    """Return the name of the single image that makes up ``cover_page``.

    ``None`` is returned unless the parsed page has exactly one body, no
    non-whitespace text in it, and exactly one ``img[@src]`` or SVG
    ``image`` with a usable href.
    """
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return None
    body = bodies[0]

    images = []
    for img in XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')(body):
        href = img.get('src') or img.get(XLINK('href'))
        if not href:
            continue
        images.append(container.href_to_name(href, base=cover_page))

    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(images) > 1:
        # Document has more content than a single image
        return None
    return images[0] if images else None

Example #20

0

Show file

File: cover.py Project: CyberTech/calibre

def find_cover_image_in_page(container, cover_page):
    """Return the name of the single image that makes up ``cover_page``.

    ``None`` is returned unless the parsed page has exactly one body, no
    non-whitespace text in it, and exactly one ``img[@src]`` or SVG
    ``image`` with a usable href.
    """
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return None
    body = bodies[0]

    images = []
    for img in XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')(body):
        href = img.get('src') or img.get(XLINK('href'))
        if not href:
            continue
        images.append(container.href_to_name(href, base=cover_page))

    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(images) > 1:
        # Document has more content than a single image
        return None
    return images[0] if images else None

Example #21

0

Show file

File: jacket.py Project: kba/calibre

 def remove_first_image(self):
     """Remove images from the first spine item that has any.

     ``self.remove_images`` is called on spine items in order until one call
     returns a positive count. If that item's body then has no text and the
     item has no ``img``/``svg`` elements left, the item is removed from the
     manifest and the ToC entries pointing at it are removed as well.
     """
     emptied = None
     for spine_item in self.oeb.spine:
         if not (self.remove_images(spine_item) > 0):
             continue
         self.log('Removed first image')
         bodies = XPath('//h:body')(spine_item.data)
         if bodies:
             remaining_text = xml2text(bodies[0]).strip()
             remaining_images = XPath('//h:img|//svg:svg')(spine_item.data)
             if not (remaining_text or remaining_images):
                 self.log('Removing %s as it has no content'%spine_item.href)
                 self.oeb.manifest.remove(spine_item)
                 emptied = spine_item
         # Only the first item that actually lost an image is processed.
         break

     if emptied is None:
         return
     # Iterate over a copy because entries are removed while looping.
     for entry in list(self.oeb.toc):
         if urldefrag(entry.href)[0] == emptied.href:
             self.oeb.toc.remove(entry)

Example #22

0

Show file

File: jacket.py Project: AtulKumar2/calibre

 def remove_first_image(self):
     """Remove images from the first spine item that has any.

     ``self.remove_images`` is called on spine items in order until one call
     returns a positive count. If that item's body then has no text and the
     item has no ``img``/``svg`` elements left, the item is removed from the
     manifest and the ToC entries pointing at it are removed as well.
     """
     emptied = None
     for spine_item in self.oeb.spine:
         if not (self.remove_images(spine_item) > 0):
             continue
         self.log('Removed first image')
         bodies = XPath('//h:body')(spine_item.data)
         if bodies:
             remaining_text = xml2text(bodies[0]).strip()
             remaining_images = XPath('//h:img|//svg:svg')(spine_item.data)
             if not (remaining_text or remaining_images):
                 self.log('Removing %s as it has no content'%spine_item.href)
                 self.oeb.manifest.remove(spine_item)
                 emptied = spine_item
         # Only the first item that actually lost an image is processed.
         break

     if emptied is None:
         return
     # Iterate over a copy because entries are removed while looping.
     for entry in list(self.oeb.toc):
         if urldefrag(entry.href)[0] == emptied.href:
             self.oeb.toc.remove(entry)

Example #23

0

Show file

File: structure.py Project: pra85/calibre

 def create_toc_from_links(self):
     """Add a ToC entry for each suitable link found in the spine.

     Only ``a[@href]`` elements whose URL has no scheme or the ``file``
     scheme are considered. A link is skipped when the ToC already has its
     href, or when duplicate link text is disallowed by
     ``self.opts.duplicate_links_in_toc`` and the ToC already has that text.
     The entry text is the first 100 characters of the link text, stripped.
     Processing stops once ``self.opts.max_toc_links`` entries were added,
     when that option is positive.
     """
     added = 0
     for item in self.oeb.spine:
         for anchor in XPath("//h:a[@href]")(item.data):
             parts = urlparse(anchor.get("href"))
             if parts[0] and parts[0] != "file":
                 continue
             path, frag = parts.path, parts.fragment
             target = item.abshref(path)
             if frag:
                 target = "#".join((target, frag))
             if self.oeb.toc.has_href(target):
                 continue
             label = xml2text(anchor)[:100].strip()
             if not self.opts.duplicate_links_in_toc and self.oeb.toc.has_text(label):
                 continue
             added += 1
             self.oeb.toc.add(label, target, play_order=self.oeb.toc.next_play_order())
             if self.opts.max_toc_links > 0 and added >= self.opts.max_toc_links:
                 self.log("Maximum TOC links reached, stopping.")
                 return

Example #24

0

Show file

File: structure.py Project: kobolabs/calibre

    def detect_chapters(self):
        """Record chapter elements from the spine and insert chapter marks.

        Every match of the ``self.opts.chapter`` XPath expression in each
        spine item is appended to ``self.detected_chapters`` as an
        ``(item, element)`` pair. Unless ``self.opts.chapter_mark`` is
        ``'none'``, a marker element is inserted before each match: an ``hr``
        for ``'rule'``, a ``div`` with a page-break-after style for
        ``'pagebreak'``, and an ``hr`` with a page-break-before style for any
        other value.
        """
        self.detected_chapters = []

        def find_matches(expr, doc):
            try:
                ans = XPath(expr)(doc)
                # len() fails for a result that has no length, so such a
                # result is reported as an invalid expression too.
                len(ans)
                return ans
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed.
                self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
                return []

        if not self.opts.chapter:
            return

        for item in self.oeb.spine:
            for x in find_matches(self.opts.chapter, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        for item, elem in self.detected_chapters:
            text = re.sub(r'\s+', ' ', xml2text(elem).strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            elif chapter_mark == 'rule':
                mark = etree.Element(XHTML('hr'))
            elif chapter_mark == 'pagebreak':
                mark = etree.Element(XHTML('div'), style=page_break_after)
            else:  # chapter_mark == 'both':
                mark = etree.Element(XHTML('hr'), style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')

Example #25

0

Show file

File: structure.py Project: sss/calibre

 def create_toc_from_links(self):
     """Add a ToC entry for each suitable link found in the spine.

     Only ``a[@href]`` elements whose URL has no scheme or the ``file``
     scheme are considered. A link is skipped when the ToC already has its
     href, or when duplicate link text is disallowed by
     ``self.opts.duplicate_links_in_toc`` and the ToC already has that text.
     The entry text is the first 100 characters of the link text, stripped.
     Processing stops once ``self.opts.max_toc_links`` entries were added,
     when that option is positive.
     """
     added = 0
     for item in self.oeb.spine:
         for anchor in XPath('//h:a[@href]')(item.data):
             parts = urlparse(anchor.get('href'))
             if parts[0] and parts[0] != 'file':
                 continue
             path, frag = parts.path, parts.fragment
             target = item.abshref(path)
             if frag:
                 target = '#'.join((target, frag))
             if self.oeb.toc.has_href(target):
                 continue
             label = xml2text(anchor)[:100].strip()
             if (not self.opts.duplicate_links_in_toc and
                     self.oeb.toc.has_text(label)):
                 continue
             added += 1
             self.oeb.toc.add(label, target,
                 play_order=self.oeb.toc.next_play_order())
             if self.opts.max_toc_links > 0 and \
                     added >= self.opts.max_toc_links:
                 self.log('Maximum TOC links reached, stopping.')
                 return

Example #26

0

Show file

    def read_inline_toc(self, href, frag):
        """Build a ``TOC`` from the links of the HTML file at ``href``.

        The file is read as bytes, decoded with ``self.header.codec`` and
        parsed with ``parse_html``. Links are collected in document order,
        starting after the body element, or after the element whose id is
        ``frag`` when such an element exists. Each distinct
        ``(text, href, fragment)`` triple is kept once. Entries are nested
        according to how deep each link sits in the HTML tree.
        """
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as stream:
            markup = stream.read().decode(self.header.codec)
        root = parse_html(markup, log=self.log)

        bodies = XPath('//h:body')(root)
        start = bodies[0] if bodies else None
        # Without a body there is nothing to wait for: collect from the top.
        reached = not bodies
        if frag:
            targets = XPath('//*[@id="%s"]' % frag)(root)
            if targets:
                start = targets[0]

        def node_depth(elem):
            # Number of ancestors between elem and the tree root.
            depth = 0
            ancestor = elem.getparent()
            while ancestor is not None:
                depth += 1
                ancestor = ancestor.getparent()
            return depth

        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            is_link = reached and elem.tag == XHTML('a') and elem.get('href', False)
            if not is_link:
                if elem is start:
                    reached = True
                continue
            link_href, link_frag = urldefrag(elem.get('href'))
            link_href = base_href + '/' + link_href
            key = (xml2text(elem).strip(), link_href, link_frag)
            if key in seen:
                continue
            seen.add(key)
            links.append(key + (node_depth(elem),))

        # Layer the ToC based on nesting order in the source HTML
        depths = sorted(set(link[-1] for link in links))
        depth_map = {raw: level for level, raw in enumerate(depths)}
        current_depth = None
        parent = ans
        for text, link_href, link_frag, raw_depth in links:
            level = depth_map[raw_depth]
            if current_depth is None:
                current_depth = 0
            elif current_depth < level:
                # Descend one level, under the most recently added entry.
                if len(parent) > 0:
                    parent = parent[-1]
                current_depth += 1
            elif current_depth > level:
                steps = current_depth - level
                while steps > 0 and parent.parent is not None:
                    parent = parent.parent
                    steps -= 1
                current_depth = level
            parent.add_item(link_href, link_frag, text)
        return ans

Example #27

0

Show file

    def read_inline_toc(self, href, frag):
        """Build a ``TOC`` from the links of the HTML file at ``href``.

        The file is read as bytes, decoded with ``self.header.codec`` and
        parsed with ``parse_html``. Links are collected in document order,
        starting after the body element, or after the element whose id is
        ``frag`` when such an element exists. Each distinct
        ``(text, href, fragment)`` triple is kept once. Entries are nested
        according to how deep each link sits in the HTML tree.
        """
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as stream:
            markup = stream.read().decode(self.header.codec)
        root = parse_html(markup, log=self.log)

        bodies = XPath('//h:body')(root)
        start = bodies[0] if bodies else None
        # Without a body there is nothing to wait for: collect from the top.
        reached = not bodies
        if frag:
            targets = XPath('//*[@id="%s"]'%frag)(root)
            if targets:
                start = targets[0]

        def node_depth(elem):
            # Number of ancestors between elem and the tree root.
            depth = 0
            ancestor = elem.getparent()
            while ancestor is not None:
                depth += 1
                ancestor = ancestor.getparent()
            return depth

        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            is_link = reached and elem.tag == XHTML('a') and elem.get('href',
                    False)
            if not is_link:
                if elem is start:
                    reached = True
                continue
            link_href, link_frag = urldefrag(elem.get('href'))
            link_href = base_href + '/' + link_href
            key = (xml2text(elem).strip(), link_href, link_frag)
            if key in seen:
                continue
            seen.add(key)
            links.append(key + (node_depth(elem),))

        # Layer the ToC based on nesting order in the source HTML
        depths = sorted(set(link[-1] for link in links))
        depth_map = {raw:level for level, raw in enumerate(depths)}
        current_depth = None
        parent = ans
        for text, link_href, link_frag, raw_depth in links:
            level = depth_map[raw_depth]
            if current_depth is None:
                current_depth = 0
            elif current_depth < level:
                # Descend one level, under the most recently added entry.
                if len(parent) > 0:
                    parent = parent[-1]
                current_depth += 1
            elif current_depth > level:
                steps = current_depth - level
                while steps > 0 and parent.parent is not None:
                    parent = parent.parent
                    steps -= 1
                current_depth = level
            parent.add_item(link_href, link_frag, text)
        return ans