Example #1
0
    def periodicalize_toc(self):
        """Reshape ``self.oeb.toc`` into a periodical/section/article tree.

        Nothing happens when the TOC is falsy or when its first top-level
        node is already of class ``'periodical'``. Otherwise the existing
        top-level nodes are regrouped, in place, under a single new
        ``'periodical'`` root node.

        Unlike the variant that prunes the manifest, this one leaves the
        manifest and spine untouched (the original author noted that removing
        a manifest item also removes its spine entry, and that ``spine[0]``
        is index.html).
        """
        from calibre.ebooks.oeb.base import TOC

        toc = self.oeb.toc
        if not toc:
            return
        if toc[0].klass == 'periodical':
            # Already in periodical form.
            return

        self.log.info('Converting TOC for MOBI periodical indexing...')

        if toc.depth() < 3:
            # Single-section periodical: create one catch-all section node and
            # put every existing top-level node underneath it.
            catch_all = TOC(klass='section',
                            title=_('All articles'),
                            href=self.oeb.spine[0].href)
            catch_all.nodes.extend(toc)
            sections = [catch_all]
        else:
            # Multi-section periodical: the first-level nodes are the sections.
            sections = list(toc)
            for section in sections:
                section.klass = 'section'
                children = list(section)  # second-level nodes
                if not children:
                    continue
                # Drop the section's own link and point at its first article.
                section.href = children[0].href

        # Detach every article from its section, tagging it on the way.
        # Keyed by id() of the section node, as in the original bookkeeping.
        detached = {}
        for section in sections:
            bucket = detached[id(section)] = []
            for article in list(section):
                article.klass = 'article'
                bucket.append(article)
                section.nodes.remove(article)

        periodical = TOC(klass='periodical',
                         href=self.oeb.spine[0].href,
                         title=unicode(self.oeb.metadata.title[0]))

        # Build the three-level root/sections/articles structure; sections
        # that ended up without any article are left out.
        for section in sections:
            members = detached[id(section)]
            if not members:
                continue
            section.nodes.extend(members)
            periodical.nodes.append(section)

        # Swap the old top-level nodes for the single periodical node, keeping
        # the same list object.
        toc.nodes[:] = [periodical]

        # Fix up the periodical href to point to first section href
        periodical.href = periodical.nodes[0].href
Example #2
0
    def flatten_toc(self):
        """Replace ``self.oeb.toc`` with a flattened copy of itself.

        Each node yielded by ``iterdescendants()`` is re-added to a fresh
        ``TOC`` root via ``add``; only the title and href are carried over.
        """
        from calibre.ebooks.oeb.base import TOC

        flat = TOC()
        for entry in self.oeb.toc.iterdescendants():
            flat.add(entry.title, entry.href)
        self.oeb.toc = flat
Example #3
0
    def periodicalize_toc(self):
        """Reshape ``self.oeb.toc`` into a periodical/section/article tree.

        Returns without changes when the TOC is falsy, when the spine has
        fewer than three items, or when the first top-level TOC node already
        has class ``'periodical'``. Otherwise the TOC is rewritten in place so
        that its only top-level node is a ``'periodical'`` node containing
        ``'section'`` nodes, which in turn contain ``'article'`` nodes.

        Side effects: items are removed from ``self.oeb.manifest`` (see the
        inline comments for which ones).
        """
        from calibre.ebooks.oeb.base import TOC
        toc = self.oeb.toc
        if not toc or len(self.oeb.spine) < 3:
            return
        if toc and toc[0].klass != 'periodical':
            # Capture the first two spine items before any manifest removal.
            one, two = self.oeb.spine[0], self.oeb.spine[1]
            self.log('Converting TOC for MOBI periodical indexing...')

            # Maps id(section node) -> list of article nodes detached from it.
            articles = {}
            if toc.depth() < 3:
                # single section periodical
                self.oeb.manifest.remove(one)
                self.oeb.manifest.remove(two)
                # NOTE(review): spine[0] is re-read after the removals above;
                # presumably manifest.remove also drops the spine entry, so
                # this is the item that used to be third -- confirm.
                sections = [
                    TOC(klass='section',
                        title=_('All articles'),
                        href=self.oeb.spine[0].href)
                ]
                # Every existing top-level node goes under the single section.
                for x in toc:
                    sections[0].nodes.append(x)
            else:
                # multi-section periodical
                self.oeb.manifest.remove(one)
                # The existing top-level nodes become the sections.
                sections = list(toc)
                # The enumerate index ``i`` is not used in the loop body.
                for i, x in enumerate(sections):
                    x.klass = 'section'
                    articles_ = list(x)
                    if articles_:
                        # Remove the manifest item the section linked to, then
                        # point the section at its first article instead. The
                        # lookup must happen before x.href is reassigned.
                        self.oeb.manifest.remove(
                            self.oeb.manifest.hrefs[x.href])
                        x.href = articles_[0].href

            # Tag each child of a section as an article and detach it; the
            # children are reattached below.
            for sec in sections:
                articles[id(sec)] = []
                for a in list(sec):
                    a.klass = 'article'
                    articles[id(sec)].append(a)
                    sec.nodes.remove(a)

            # spine[0] here is read after the manifest removals above.
            root = TOC(klass='periodical',
                       href=self.oeb.spine[0].href,
                       title=unicode_type(self.oeb.metadata.title[0]))

            # Reattach articles and add the section to the root; sections with
            # no articles are not added.
            for s in sections:
                if articles[id(s)]:
                    for a in articles[id(s)]:
                        s.nodes.append(a)
                    root.nodes.append(s)

            # Empty the original top-level node list in place...
            for x in list(toc.nodes):
                toc.nodes.remove(x)

            # ...and make the periodical node the only top-level entry.
            toc.nodes.append(root)

            # Fix up the periodical href to point to first section href
            toc.nodes[0].href = toc.nodes[0].nodes[0].href
Example #4
0
 def _process_nodes(self, root):
     """Build and return a ``TOC`` from every ``<object>`` element under *root*.

     Each element matched by the XPath ``//object`` is handed to
     ``self.add_node`` together with the TOC being built and a dict that is
     shared across all calls.
     """
     from calibre.ebooks.oeb.base import TOC
     result = TOC()
     ancestors = {}
     for obj in root.xpath('//object'):
         self.add_node(obj, result, ancestors)
     return result
Example #5
0
    def __call__(self, oeb, opts):
        """Run structure detection on *oeb* using the settings in *opts*.

        Steps, in order: detect chapters; optionally regenerate the TOC;
        drop TOC entries matching ``opts.toc_filter``; add
        ``page-break-before:always`` to elements matched by
        ``opts.page_breaks_before``; give untitled TOC entries a placeholder
        title; and, if ``opts.start_reading_at`` is set, detect the
        start-reading position.
        """
        self.log, self.oeb, self.opts = oeb.log, oeb, opts
        self.log('Detecting structure...')

        self.detect_chapters()

        if self.oeb.auto_generated_toc or opts.use_auto_toc:
            previous_toc = self.oeb.toc
            self.oeb.toc = TOC()
            self.create_level_based_toc()
            if self.oeb.toc.count() < 1:
                # Level-based generation produced nothing: fall back to
                # chapters, then to links if there are still too few entries.
                use_chapters = (not opts.no_chapters_in_toc
                                and self.detected_chapters)
                if use_chapters:
                    self.create_toc_from_chapters()
                if self.oeb.toc.count() < opts.toc_threshold:
                    self.create_toc_from_links()
            keep_previous = (self.oeb.toc.count() < 2
                             and previous_toc.count() > 2)
            if not keep_previous:
                self.oeb.auto_generated_toc = True
                self.log('Auto generated TOC with %d entries.' %
                         self.oeb.toc.count())
            else:
                # The regenerated TOC is poorer than the one we started with.
                self.oeb.toc = previous_toc

        if opts.toc_filter is not None:
            matcher = re.compile(opts.toc_filter)
            # Iterate over a snapshot because entries are removed as we go.
            for entry in list(self.oeb.toc.iter()):
                title = entry.title
                if title and matcher.search(title) is None:
                    continue
                self.log('Filtering', title or 'empty node', 'from TOC')
                self.oeb.toc.remove(entry)

        if opts.page_breaks_before is not None:
            find_breaks = XPath(opts.page_breaks_before)
            headings = {'h1', 'h2'}
            for item in oeb.spine:
                for elem in find_breaks(item.data):
                    try:
                        prev = next(
                            elem.itersiblings(tag=etree.Element,
                                              preceding=True))
                    except StopIteration:
                        # No preceding element sibling.
                        prev = None

                    if (prev is not None
                            and barename(elem.tag) in headings
                            and barename(prev.tag) in headings
                            and not (prev.tail and prev.tail.split())):
                        # We have two adjacent headings, do not put a page
                        # break on the second one
                        continue

                    existing = elem.get('style', '')
                    prefix = existing + '; ' if existing else existing
                    elem.set('style', prefix + 'page-break-before:always')

        for entry in self.oeb.toc.iter():
            if not (entry.title and entry.title.strip()):
                entry.title = _('Unnamed')

        if self.opts.start_reading_at:
            self.detect_start_reading()
Example #6
0
    def parse_html_toc(self, item):
        """Build a ``TOC`` from the nested ``div`` structure in ``item.data``.

        Starting from the first match of ``//h:div[1]``, every direct child
        ``div`` contributes one TOC entry taken from its first direct ``a``
        child (link text as title, ``href`` attribute as target). The child's
        own ``div`` children are processed recursively beneath that entry.

        Raises ``IndexError`` if the starting ``div`` is missing or a child
        ``div`` has no direct ``a`` child, and ``KeyError`` if that ``a`` has
        no ``href`` attribute.
        """
        from calibre.ebooks.oeb.base import TOC, XPath
        child_divs = XPath('./h:div')
        first_link = XPath('./h:a[1]')

        def walk(toc_node, container):
            # Depth-first, in document order.
            for div in child_divs(container):
                link = first_link(div)[0]
                walk(toc_node.add(link.text, link.attrib['href']), div)

        result = TOC()
        top_div = XPath('//h:div[1]')(item.data)[0]
        walk(result, top_div)
        return result
Example #7
0
def sony_metadata(oeb):
    """Build the Sony periodical metadata and Atom feed for *oeb*.

    Title, publication type, date, language and a uuid identifier are read
    from ``oeb.metadata``; each of the optional lookups falls back to a
    default when it fails. The feed entries are generated from ``oeb.toc``:
    one section entry per top-level node with an href, followed by one
    article entry per child node with an href.

    Returns a ``(metadata, atom)`` tuple where *metadata* is the formatted
    ``SONY_METADATA`` string and *atom* is the formatted ``SONY_ATOM``
    document encoded as UTF-8 bytes.
    """
    m = oeb.metadata
    title = short_title = str(m.title[0])
    publisher = __appname__ + ' ' + __version__
    # The lookups below are best-effort. ``except Exception`` (rather than a
    # bare ``except``) keeps the fallbacks while letting KeyboardInterrupt and
    # SystemExit propagate.
    try:
        pt = str(oeb.metadata.publication_type[0])
        # Keep everything after the second ':' of the publication type.
        short_title = ':'.join(pt.split(':')[2:])
    except Exception:
        pass

    try:
        date = parse_date(str(m.date[0]),
                as_utc=False).strftime('%Y-%m-%d')
    except Exception:
        # No usable date in the metadata: use the current date.
        date = strftime('%Y-%m-%d')
    try:
        language = str(m.language[0]).replace('_', '-')
    except Exception:
        language = 'en'
    short_title = xml(short_title, True)

    metadata = SONY_METADATA.format(title=xml(title),
            short_title=short_title,
            publisher=xml(publisher), issue_date=xml(date),
            language=xml(language))

    updated = strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    def cal_id(x):
        # True for identifiers carrying a ``...scheme="uuid"`` attribute.
        for k, v in x.attrib.items():
            if k.endswith('scheme') and v == 'uuid':
                return True
        return False

    try:
        base_id = str(list(filter(cal_id, m.identifier))[0])
    except Exception:
        # No uuid identifier available: generate a fresh one.
        base_id = str(uuid4())

    toc = oeb.toc

    if False and toc.depth() < 3:
        # Single section periodical
        # Disabled since I prefer the current behavior
        from calibre.ebooks.oeb.base import TOC
        section = TOC(klass='section', title=_('All articles'),
                    href=oeb.spine[2].href)
        for x in toc:
            section.nodes.append(x)
        toc = TOC(klass='periodical', href=oeb.spine[2].href,
                    title=str(oeb.metadata.title[0]))
        toc.nodes.append(section)

    entries = []
    seen_titles = set()
    for i, section in enumerate(toc):
        if not section.href:
            continue
        secid = 'section%d'%i
        sectitle = section.title
        if not sectitle:
            sectitle = _('Unknown')
        # Make the section title unique by appending a counter.
        d = 1
        bsectitle = sectitle
        while sectitle in seen_titles:
            sectitle = bsectitle + ' ' + str(d)
            d += 1
        seen_titles.add(sectitle)
        sectitle = xml(sectitle, True)
        secdesc = section.description
        if not secdesc:
            secdesc = ''
        secdesc = xml(secdesc)
        entries.append(SONY_ATOM_SECTION.format(title=sectitle,
            href=section.href, id=xml(base_id)+'/'+secid,
            short_title=short_title, desc=secdesc, updated=updated))

        for j, article in enumerate(section):
            if not article.href:
                continue
            atitle = article.title
            btitle = atitle
            d = 1
            # NOTE(review): article titles are only de-duplicated against
            # section titles; they are never added to seen_titles. Confirm
            # whether that is intended.
            while atitle in seen_titles:
                atitle = btitle + ' ' + str(d)
                d += 1

            auth = article.author if article.author else ''
            # NOTE(review): the article entry uses the *section* description,
            # not article.description -- looks like it may be unintended;
            # left unchanged pending confirmation.
            desc = section.description
            if not desc:
                desc = ''
            aid = 'article%d'%j

            entries.append(SONY_ATOM_ENTRY.format(
                title=xml(atitle),
                author=xml(auth),
                updated=updated,
                # Escaped like the section description above so that markup
                # characters in the text cannot break the generated XML.
                desc=xml(desc),
                short_title=short_title,
                section_title=sectitle,
                href=article.href,
                word_count=str(1),
                id=xml(base_id)+'/'+secid+'/'+aid
            ))

    atom = SONY_ATOM.format(short_title=short_title,
            entries='\n\n'.join(entries), updated=updated,
            id=xml(base_id)).encode('utf-8')

    return metadata, atom
Example #8
0
 def flatten_toc(self):
     """Swap ``self.oeb.toc`` for a new ``TOC`` rebuilt from its descendants.

     The title and href of each node from ``iterdescendants()`` are passed
     to ``add`` on the new root, in iteration order.
     """
     from calibre.ebooks.oeb.base import TOC
     flattened = TOC()
     add_entry = flattened.add
     for descendant in self.oeb.toc.iterdescendants():
         add_entry(descendant.title, descendant.href)
     self.oeb.toc = flattened