def periodicalize_toc(self):
    """Regroup ``self.oeb.toc`` into a periodical/section/article tree.

    Nothing happens when the TOC is falsy or when its first entry already
    has ``klass == 'periodical'``. Otherwise every existing entry is moved
    beneath a single new ``periodical`` node, which becomes the only child
    of the TOC. This variant never removes anything from the manifest or
    the spine.
    """
    from calibre.ebooks.oeb.base import TOC

    toc = self.oeb.toc
    if not toc:
        return
    if toc[0].klass == 'periodical':
        return

    # Note carried over from the original author: spine[0] is index.html.
    self.log.info('Converting TOC for MOBI periodical indexing...')

    if toc.depth() < 3:
        # Single-section periodical: create one catch-all section and put
        # every existing top-level entry underneath it.
        catch_all = TOC(klass='section', title=_('All articles'),
                        href=self.oeb.spine[0].href)
        sections = [catch_all]
        for entry in toc:
            catch_all.nodes.append(entry)
    else:
        # Multi-section periodical: the first-level entries are the sections.
        sections = list(toc)
        for section in sections:
            section.klass = 'section'
            children = list(section)  # second-level entries
            if children:
                # Drop the section's own link and point it at its first
                # article instead.
                section.href = children[0].href

    # Detach the articles from every section, remembering them per section
    # (keyed by object identity).
    members_of = {}
    for section in sections:
        detached = members_of[id(section)] = []
        for child in list(section):
            child.klass = 'article'
            detached.append(child)
            section.nodes.remove(child)

    root = TOC(klass='periodical', href=self.oeb.spine[0].href,
               title=unicode(self.oeb.metadata.title[0]))

    # Build the three-level root/sections/articles structure; sections that
    # ended up without articles are left out.
    for section in sections:
        detached = members_of[id(section)]
        if not detached:
            continue
        for child in detached:
            section.nodes.append(child)
        root.nodes.append(section)

    # Replace the old top-level entries with the single periodical node.
    for stale in list(toc.nodes):
        toc.nodes.remove(stale)
    toc.nodes.append(root)

    # Fix up the periodical href to point to the first section's href.
    periodical = toc.nodes[0]
    periodical.href = periodical.nodes[0].href
def flatten_toc(self):
    """Replace ``self.oeb.toc`` with a fresh TOC holding one entry per descendant.

    Each descendant yielded by ``iterdescendants()`` is re-added directly to
    the new root using only its title and href.
    """
    from calibre.ebooks.oeb.base import TOC

    flat_root = TOC()
    for entry in self.oeb.toc.iterdescendants():
        flat_root.add(entry.title, entry.href)
    self.oeb.toc = flat_root
def periodicalize_toc(self):
    """Regroup ``self.oeb.toc`` into a periodical/section/article tree.

    Nothing happens when the TOC is falsy, when the spine has fewer than
    three items, or when the first TOC entry already has
    ``klass == 'periodical'``. Otherwise the first spine item (and, for a
    single-section periodical, the second one too) is removed from the
    manifest, and every TOC entry is moved beneath a single new
    ``periodical`` node that becomes the only child of the TOC.
    """
    from calibre.ebooks.oeb.base import TOC

    oeb = self.oeb
    toc = oeb.toc
    if not toc:
        return
    if len(oeb.spine) < 3:
        return
    if toc[0].klass == 'periodical':
        return

    first_item, second_item = oeb.spine[0], oeb.spine[1]
    self.log('Converting TOC for MOBI periodical indexing...')

    single_section = toc.depth() < 3
    # Both layouts drop the first spine item from the manifest.
    # NOTE(review): presumably removing an item from the manifest also drops
    # it from the spine, so ``spine[0]`` below is the first remaining item -
    # confirm against the manifest implementation.
    oeb.manifest.remove(first_item)

    if single_section:
        oeb.manifest.remove(second_item)
        # Wrap every existing top-level entry in one catch-all section.
        catch_all = TOC(klass='section', title=_('All articles'),
                        href=oeb.spine[0].href)
        sections = [catch_all]
        for entry in toc:
            catch_all.nodes.append(entry)
    else:
        # The first-level entries are the sections.
        sections = list(toc)
        for section in sections:
            section.klass = 'section'
            children = list(section)
            if children:
                # Remove the manifest item the section pointed at, then
                # point the section at its first article.
                oeb.manifest.remove(oeb.manifest.hrefs[section.href])
                section.href = children[0].href

    # Detach the articles from every section, remembering them per section
    # (keyed by object identity).
    members_of = {}
    for section in sections:
        detached = members_of[id(section)] = []
        for child in list(section):
            child.klass = 'article'
            detached.append(child)
            section.nodes.remove(child)

    root = TOC(klass='periodical', href=oeb.spine[0].href,
               title=unicode_type(oeb.metadata.title[0]))

    # Re-attach the articles; sections without any article are left out.
    for section in sections:
        detached = members_of[id(section)]
        if not detached:
            continue
        for child in detached:
            section.nodes.append(child)
        root.nodes.append(section)

    # Replace the old top-level entries with the single periodical node.
    for stale in list(toc.nodes):
        toc.nodes.remove(stale)
    toc.nodes.append(root)

    # Fix up the periodical href to point to the first section's href.
    periodical = toc.nodes[0]
    periodical.href = periodical.nodes[0].href
def _process_nodes(self, root):
    """Build a TOC from every ``object`` element found under ``root``.

    Each element matched by the ``//object`` XPath is handed to
    ``self.add_node`` together with the TOC being built and a dict that is
    shared across all calls. Returns the resulting TOC.
    """
    from calibre.ebooks.oeb.base import TOC

    toc, ancestor_map = TOC(), {}
    for obj in root.xpath('//object'):
        self.add_node(obj, toc, ancestor_map)
    return toc
def __call__(self, oeb, opts):
    """Detect structure in ``oeb`` and tidy up its table of contents.

    Steps, in order:

    1. Store ``oeb``, ``opts`` and the log on ``self`` and detect chapters.
    2. If the TOC was auto generated or ``opts.use_auto_toc`` is set,
       rebuild it (level based, then from chapters and/or links), falling
       back to the previous TOC when the rebuilt one has fewer than two
       entries and the previous one had more than two.
    3. If ``opts.toc_filter`` is given, remove TOC nodes whose title is
       empty or matches that regular expression.
    4. If ``opts.page_breaks_before`` is given, add
       ``page-break-before:always`` to the style of every matching element,
       skipping the second of two adjacent h1/h2 headings.
    5. Give every TOC node with a blank title a placeholder title.
    6. If ``opts.start_reading_at`` is set, detect the start-reading point.
    """
    self.log = oeb.log
    self.oeb = oeb
    self.opts = opts
    self.log('Detecting structure...')

    self.detect_chapters()

    if self.oeb.auto_generated_toc or opts.use_auto_toc:
        previous_toc = self.oeb.toc
        self.oeb.toc = TOC()
        self.create_level_based_toc()
        if self.oeb.toc.count() < 1:
            chapters_allowed = not opts.no_chapters_in_toc
            if chapters_allowed and self.detected_chapters:
                self.create_toc_from_chapters()
            if self.oeb.toc.count() < opts.toc_threshold:
                self.create_toc_from_links()
        rebuilt_is_worse = (self.oeb.toc.count() < 2
                            and previous_toc.count() > 2)
        if rebuilt_is_worse:
            self.oeb.toc = previous_toc
        else:
            self.oeb.auto_generated_toc = True
            entry_count = self.oeb.toc.count()
            self.log('Auto generated TOC with %d entries.' % entry_count)

    if opts.toc_filter is not None:
        unwanted = re.compile(opts.toc_filter)
        # Iterate over a snapshot because nodes are removed while looping.
        for node in list(self.oeb.toc.iter()):
            title = node.title
            if title and unwanted.search(title) is None:
                continue
            self.log('Filtering', title or 'empty node', 'from TOC')
            self.oeb.toc.remove(node)

    if opts.page_breaks_before is not None:
        find_break_targets = XPath(opts.page_breaks_before)
        heading_tags = {'h1', 'h2'}
        for item in oeb.spine:
            for elem in find_break_targets(item.data):
                try:
                    prev = next(elem.itersiblings(tag=etree.Element,
                                                  preceding=True))
                    both_headings = (barename(elem.tag) in heading_tags
                                     and barename(prev.tag) in heading_tags)
                    if both_headings and not (prev.tail and prev.tail.split()):
                        # Two adjacent headings with only whitespace between
                        # them: do not put a page break on the second one.
                        continue
                except StopIteration:
                    # No preceding sibling element: fall through and add
                    # the page break.
                    pass

                existing = elem.get('style', '')
                prefix = existing + '; ' if existing else existing
                elem.set('style', prefix + 'page-break-before:always')

    for node in self.oeb.toc.iter():
        if not (node.title and node.title.strip()):
            node.title = _('Unnamed')

    if self.opts.start_reading_at:
        self.detect_start_reading()
def parse_html_toc(self, item):
    """Build a TOC from the nested ``div`` structure in ``item.data``.

    Starting from the first match of ``//h:div[1]``, every child ``div`` is
    turned into a TOC entry whose title and href come from the text and
    ``href`` attribute of the first ``a`` child of that ``div``. Nested
    ``div`` elements become nested TOC entries. Entries are added in
    document (pre-order) order. Returns the new TOC.
    """
    from calibre.ebooks.oeb.base import TOC, XPath

    child_divs = XPath('./h:div')
    first_anchor = XPath('./h:a[1]')

    toc = TOC()
    top_div = XPath('//h:div[1]')(item.data)[0]

    # Explicit-stack depth-first walk: each frame pairs a TOC node with the
    # iterator over the child divs that still have to be added beneath it.
    exhausted = object()
    pending = [(toc, iter(child_divs(top_div)))]
    while pending:
        parent, remaining = pending[-1]
        div = next(remaining, exhausted)
        if div is exhausted:
            pending.pop()
            continue
        anchor = first_anchor(div)[0]
        node = parent.add(anchor.text, anchor.attrib['href'])
        pending.append((node, iter(child_divs(div))))

    return toc
def sony_metadata(oeb):
    """Build the Sony periodical metadata and Atom feed for ``oeb``.

    The title, publication type, date, language and identifier are read
    from ``oeb.metadata`` on a best-effort basis; each falls back to a
    default when it is missing or malformed. One Atom section entry is
    emitted for every top-level TOC node that has an href, followed by one
    Atom entry for each of its children that has an href.

    Returns:
        A ``(metadata, atom)`` tuple. ``metadata`` is the formatted
        ``SONY_METADATA`` text; ``atom`` is the formatted ``SONY_ATOM``
        document encoded as UTF-8.
    """
    m = oeb.metadata
    title = short_title = str(m.title[0])
    publisher = __appname__ + ' ' + __version__

    # Every metadata lookup below is deliberately best effort, but only
    # ordinary errors are swallowed: ``except Exception`` lets
    # KeyboardInterrupt and SystemExit propagate.
    try:
        pt = str(m.publication_type[0])
        short_title = ':'.join(pt.split(':')[2:])
    except Exception:
        pass

    try:
        date = parse_date(str(m.date[0]), as_utc=False).strftime('%Y-%m-%d')
    except Exception:
        date = strftime('%Y-%m-%d')

    try:
        language = str(m.language[0]).replace('_', '-')
    except Exception:
        language = 'en'

    # NOTE(review): the second argument presumably requests attribute-safe
    # escaping - confirm against the definition of ``xml``.
    short_title = xml(short_title, True)

    metadata = SONY_METADATA.format(
        title=xml(title),
        short_title=short_title,
        publisher=xml(publisher),
        issue_date=xml(date),
        language=xml(language),
    )

    updated = strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    def cal_id(x):
        # True when any attribute whose name ends in 'scheme' equals 'uuid'.
        return any(k.endswith('scheme') and v == 'uuid'
                   for k, v in x.attrib.items())

    try:
        base_id = str([ident for ident in m.identifier if cal_id(ident)][0])
    except Exception:
        # No usable uuid identifier: make one up.
        base_id = str(uuid4())

    # The TOC is used as is. The former single-section wrapping was
    # explicitly disabled (guarded by ``if False``) and has been dropped.
    toc = oeb.toc

    entries = []
    seen_titles = set()

    def unique_title(base):
        # Append ' 1', ' 2', ... to ``base`` until it no longer clashes with
        # a title already recorded in ``seen_titles``.
        candidate, counter = base, 1
        while candidate in seen_titles:
            candidate = base + ' ' + str(counter)
            counter += 1
        return candidate

    for i, section in enumerate(toc):
        if not section.href:
            continue
        secid = 'section%d' % i
        sectitle = unique_title(section.title or _('Unknown'))
        seen_titles.add(sectitle)
        sectitle = xml(sectitle, True)
        secdesc = xml(section.description or '')
        entries.append(SONY_ATOM_SECTION.format(
            title=sectitle,
            href=section.href,
            id=xml(base_id) + '/' + secid,
            short_title=short_title,
            desc=secdesc,
            updated=updated,
        ))

        for j, article in enumerate(section):
            if not article.href:
                continue
            # Article titles are checked against the recorded titles but,
            # as before, are not recorded themselves.
            atitle = unique_title(article.title)
            auth = article.author if article.author else ''
            aid = 'article%d' % j

            # Article entries carry the *section* description, as in the
            # original code. It is now XML-escaped (``secdesc``) so that a
            # description containing '&' or '<' cannot break the feed.
            # NOTE(review): confirm whether ``article.description`` was the
            # intended source.
            entries.append(SONY_ATOM_ENTRY.format(
                title=xml(atitle),
                author=xml(auth),
                updated=updated,
                desc=secdesc,
                short_title=short_title,
                section_title=sectitle,
                href=article.href,
                word_count=str(1),
                id=xml(base_id) + '/' + secid + '/' + aid,
            ))

    atom = SONY_ATOM.format(
        short_title=short_title,
        entries='\n\n'.join(entries),
        updated=updated,
        id=xml(base_id),
    ).encode('utf-8')

    return metadata, atom