def clean_stars(div: HtmlElement) -> None:
    """Tidy one chapter of "Unspeakable Desolation Pouring Down From the Stars".

    ``div`` is modified in place:

    * every ``p[strong[em]]`` match and every ``<hr>`` is dropped,
    * the story title is inserted as a leading ``<h1>``,
    * the ``./p[1]`` paragraph is replaced by an ``<h2>`` holding its
      title-cased text,
    * the ``./p[strong[a]]`` paragraph is replaced by a ``breakabove``
      paragraph containing a single ``internal`` link.
    """
    for emphasised in elements(div, "p[strong[em]]"):
        emphasised.drop_tree()
    for rule in elements(div, ".//hr"):
        rule.drop_tree()

    div.insert(0, H1("Unspeakable Desolation Pouring Down From the Stars"))

    chapter_para = element(div, "./p[1]")
    chapter_heading = H2(chapter_para.text_content().title())
    replace(chapter_para, chapter_heading)

    link_para = element(div, "./p[strong[a]]")
    # The href is taken from a separate ./p/strong/a lookup, while the link
    # text is the whole text of link_para.
    link = element(div, "./p/strong/a")
    replacement = P(
        CLASS("breakabove"),
        A(
            link_para.text_content(),
            CLASS("internal"),
            href=link.attrib["href"],
        ),
    )
    replace(link_para, replacement)
def main():
    """Convert posts, quotes, preambles and miscellany, then refresh the TOC.

    Every item yielded by the four sources is cleaned and handed to
    ``write``; afterwards ``update_bigbook_toc.run`` is called on the
    ``BigBook / "Text"`` directory.
    """
    # Posts: unwrap the link inside the <h2> and promote it to <h1>.
    for div, data in posts():
        heading = element(div, "h2")
        heading_link = element(heading, "a")
        heading_link.drop_tag()
        heading.tag = "h1"
        clean(div, data)
        write(div, data)

    # Quotes: the heading is inserted after cleaning.
    for div, data in quotes():
        clean(div, data)
        div.insert(0, H1("Quote of the Day"))
        write(div, data)

    # Preambles: reset attributes to a single class and drop the ./a element.
    for div, data in preambles():
        div.attrib.clear()
        div.attrib["class"] = "monthly-preamble"
        element(div, "./a").drop_tree()
        clean(div, data)
        write(div, data)

    # Miscellany: prefer a previously edited XHTML version when one exists.
    for div, data in miscellany():
        edited_path = OldBook / "Text" / (data.href.removesuffix(".htm") + ".xhtml")
        if not edited_path.exists():
            # NOTE(review): assumes clean() belongs to this branch together
            # with clean_misc_text() -- confirm against the original layout.
            clean_misc_text(div)
            clean(div, data)
        else:
            root: HtmlElement = parse(str(edited_path)).getroot()
            div = element(root, "body")
            div.tag = "div"
            # Re-point each image at the value returned by copy_resource.
            for image in elements(div, ".//img"):
                image_name = Path(image.attrib["src"]).name
                image.attrib["src"] = copy_resource(
                    data.date,
                    image_name,
                    OldBook / "Images",
                    BigBook / "Images",
                )
        if "unspeakable-desolation-pouring-down-from-the-stars-chapter" in data.name:
            clean_stars(div)
        write(div, data)

    update_bigbook_toc.run(BigBook / "Text")
def toc_as_html(toc, pdf, opts):
    """Build the HTML page for a PDF table of contents.

    :param toc: table-of-contents object, passed to ``process_children``.
    :param pdf: object whose ``engine.pdf`` attribute is passed on to
        ``process_children``.
    :param opts: options object; ``opts.extra_css`` (may be empty or
        ``None``) is appended after the generated stylesheet rules.
    :return: the result of ``tostring`` on the document, requested with
        ``encoding='utf-8'``, pretty-printing and a content-type meta tag.
    """
    pdf = pdf.engine.pdf

    # Flattened (level, padding-in-em) pairs for levels 1..6, in the order
    # the %-placeholders of the stylesheet consume them.
    # ``range`` replaces the Python-2-only ``xrange``; for six items the two
    # are interchangeable and ``range`` exists on Python 2 and 3.
    levels = range(1, 7)
    indents = []
    for level in levels:
        indents.extend((level, 1.4 * level))

    # The stylesheet text is kept exactly as before; only the way the
    # literal is written changed (one per-level rule repeated per level).
    # NOTE(review): ``%.1g`` keeps ONE significant digit, so 1.4 is emitted
    # as "1em" and 2.8 as "3em" -- confirm this rounding is intended
    # (``%.1f`` would keep the fractional part).
    css_template = (
        ' .calibre-pdf-toc table { width: 100%% }'
        ' .calibre-pdf-toc table tr td:last-of-type { text-align: right }'
        ' .calibre-pdf-toc .level-0 { font-size: larger; }'
        + ' .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }' * len(levels)
        + ' '
    )
    css = css_template % tuple(indents) + (opts.extra_css or '')

    html = HTML(
        HEAD(STYLE(css)),
        BODY(
            H1(_('Table of Contents')),
            TABLE(),
        ))
    body = html[1]  # html[0] is <head>, html[1] is <body>
    body.set('class', 'calibre-pdf-toc')
    # body[1] is the (still empty) <table>; process_children receives it
    # together with the toc and a starting level of 0.
    process_children(toc, body[1], 0, pdf)

    return tostring(html, pretty_print=True, include_meta_content_type=True, encoding='utf-8')
def __call__(self):
    """Convert the DOCX document held by ``self.docx`` to HTML.

    The steps run in a fixed order and communicate through instance
    attributes: read fields, styles and images; convert the paragraphs in
    ``self.page_map`` into ``self.body``; append footnotes; turn leading
    tab runs into ``text-indent``; resolve links; cascade styles; apply
    table, numbering and frame markup; assign CSS class names; clean up the
    markup; and write the result.

    :return: whatever ``self.write(doc)`` returns.
    """
    doc = self.docx.document
    relationships_by_id, relationships_by_type = self.docx.document_relationships
    self.fields(doc, self.log)
    self.read_styles(relationships_by_type)
    self.images(relationships_by_id)
    # Reset the per-conversion bookkeeping containers.
    self.layers = OrderedDict()
    self.framed = [[]]
    self.frame_map = {}
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    self.link_source_map = {}
    self.toc_anchor = None
    self.block_runs = []
    paras = []

    self.log.debug('Converting Word markup to HTML')

    self.read_page_properties(doc)
    self.resolve_alternate_content(doc)
    # Main document content uses the document-level relationships.
    self.current_rels = relationships_by_id
    for wp, page_properties in iteritems(self.page_map):
        self.current_page = page_properties
        # Only elements whose tag ends in '}p' are converted here.
        if wp.tag.endswith('}p'):
            p = self.convert_p(wp)
            self.body.append(p)
            paras.append(wp)

    self.read_block_anchors(doc)
    self.styles.apply_contextual_spacing(paras)
    self.mark_block_runs(paras)
    # Apply page breaks at the start of every section, except the first
    # section (since that will be the start of the file)
    self.styles.apply_section_page_breaks(self.section_starts[1:])

    notes_header = None
    # Remembered so the image relationship map can be restored once the
    # notes (which switch it to their own relationships) are done.
    orig_rid_map = self.images.rid_map
    if self.footnotes.has_notes:
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        for anchor, text, note in self.footnotes:
            # Each note gets its own <dl class="footnote" id=anchor>.
            dl = DL(id=anchor)
            dl.set('class', 'footnote')
            self.body.append(dl)
            # The <dt> holds "[", a link to '#back_<anchor>', then "]"
            # (stored as the tail of the link).
            dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
            dl[-1][0].tail = ']'
            dl.append(DD())
            paras = []
            # While converting this note, both the image map and the
            # current relationships point at note.rels[0].
            self.images.rid_map = self.current_rels = note.rels[0]
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    # Tables are registered rather than converted here; they
                    # are recorded in page_map with the current page.
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    p = self.convert_p(wp)
                    dl[-1].append(p)  # dl[-1] is the <dd> appended above
                    paras.append(wp)
            self.styles.apply_contextual_spacing(paras)
            self.mark_block_runs(paras)

    for p, wp in iteritems(self.object_map):
        # Match objects with no leading text whose first child also has no
        # leading text and starts with an element of class 'tab'.
        if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
            # Paragraph uses tabs for indentation, convert to text-indent
            parent = p[0]
            tabs = []
            # Collect the leading run of tab elements; stop at the first
            # non-tab child or right after a tab that is followed by text.
            for child in parent:
                if child.get('class', None) == 'tab':
                    tabs.append(child)
                    if child.tail:
                        break
                else:
                    break
            indent = len(tabs) * self.settings.default_tab_stop
            style = self.styles.resolve(wp)
            # Only rewrite when text_indent is the ``inherit`` sentinel or a
            # string ending in 'pt'.
            if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                if style.text_indent is not inherit:
                    # Add the tab width to the existing indent (strip 'pt').
                    indent = float(style.text_indent[:-2]) + indent
                style.text_indent = '%.3gpt' % indent
                # Carry the text after the last tab over to the parent before
                # the tabs are removed (presumably because removing an element
                # also discards its tail -- confirm).
                parent.text = tabs[-1].tail or ''
                # list() forces the map so every removal actually runs.
                list(map(parent.remove, tabs))

    self.images.rid_map = orig_rid_map

    self.resolve_links()

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    numbered = []
    for html_obj, obj in iteritems(self.object_map):
        raw = obj.get('calibre_num_id', None)
        if raw is not None:
            # raw is split at the first ':'; the part before it is the level
            # and the part after it is the numbering id.
            lvl, num_id = raw.partition(':')[0::2]
            try:
                lvl = int(lvl)
            except (TypeError, ValueError):
                lvl = 0  # non-numeric level falls back to 0
            numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()

    # Put each body child on its own tab-indented line in the output.
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in iteritems(self.object_map):
        style = self.styles.resolve(obj)
        if style is not None:
            css = style.css
            if css:
                cls = self.styles.class_name(css)
                if cls:
                    html_obj.set('class', cls)
    for html_obj, css in iteritems(self.framed_map):
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Only the first heading yielded is used (note the unconditional
        # break): the notes header takes over its tag and, if it has a class
        # other than 'notes-header', that class as well.
        for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
            notes_header.tag = h.tag
            cls = h.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break

    self.fields.polish_markup(self.object_map)

    self.log.debug('Cleaning up redundant markup generated by Word')
    # cleanup_markup's return value is stored as the cover image.
    self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

    return self.write(doc)
def __call__(self):
    """Convert the DOCX document held by ``self.docx`` to HTML.

    In order: read styles and images, convert every entry of
    ``self.page_map`` into ``self.body``, append the footnotes as a
    ``<dl class="notes">`` under a notes header, resolve links, cascade
    styles, apply table, numbering and frame markup, lay out the body
    whitespace and assign CSS class names.

    :return: whatever ``self.write()`` returns.
    """
    # NOTE(review): ``.iteritems()`` below exists only on Python 2 mappings.
    document = self.docx.document
    rels_by_id, rels_by_type = self.docx.document_relationships
    self.read_styles(rels_by_type)
    self.images(rels_by_id)

    # Fresh per-conversion bookkeeping.
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)

    self.read_page_properties(document)
    for word_para, props in self.page_map.iteritems():
        self.current_page = props
        html_para = self.convert_p(word_para)
        self.body.append(html_para)

    notes_header = None
    if self.footnotes.has_notes:
        notes_list = DL()
        notes_list.set('class', 'notes')
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        self.body.append(notes_list)
        for anchor, text, note in self.footnotes:
            # <dt id=anchor>[<a href="#back_anchor">...</a>]</dt>
            back_link = A('←' + text, href='#back_%s' % anchor, title=text)
            notes_list.append(DT('[', back_link, id=anchor))
            notes_list[-1][0].tail = ']'
            notes_list.append(DD())
            inside_table = False
            for word_para in note:
                if word_para.tag.endswith('}tbl'):
                    self.tables.register(word_para)
                    inside_table = True
                    continue
                if inside_table:
                    # Still inside the table while a w:tbl ancestor exists.
                    if ancestor(word_para, 'w:tbl') is None:
                        inside_table = False
                    else:
                        self.tables.add(word_para)
                html_para = self.convert_p(word_para)
                notes_list[-1].append(html_para)  # the <dd> added above

    self.resolve_links(rels_by_id)

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map)

    # Gather (html element, numbering id, level) for numbered objects.
    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get('calibre_num_id', None)
        if raw is None:
            continue
        level, _sep, num_id = raw.partition(':')
        try:
            level = int(level)
        except (TypeError, ValueError):
            level = 0
        numbered.append((html_obj, num_id, level))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
    self.apply_frames()

    # One body child per tab-indented line.
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for block in self.body:
            block.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is None:
            continue
        css = style.css
        if not css:
            continue
        class_name = self.styles.class_name(css)
        if class_name:
            html_obj.set('class', class_name)
    for html_obj, css in self.framed_map.iteritems():
        class_name = self.styles.class_name(css)
        if class_name:
            html_obj.set('class', class_name)

    if notes_header is not None:
        # Only the first h1/h2/h3 child yielded is consulted.
        first_heading = next(iter(self.body.iterchildren('h1', 'h2', 'h3')), None)
        if first_heading is not None:
            notes_header.tag = first_heading.tag
            heading_class = first_heading.get('class', None)
            if heading_class and heading_class != 'notes-header':
                notes_header.set('class', '%s notes-header' % heading_class)

    return self.write()
def __call__(self):
    """Convert the DOCX document held by ``self.docx`` to HTML.

    In order: read fields, styles and images, convert the ``}p`` entries of
    ``self.page_map`` into ``self.body``, apply contextual spacing and
    section page breaks, append the footnotes as a ``<dl class="notes">``
    under a notes header, resolve links, cascade styles, apply table,
    numbering and frame markup, assign CSS class names and clean up the
    markup.

    :return: whatever ``self.write(doc)`` returns.
    """
    # NOTE(review): ``.iteritems()`` below exists only on Python 2 mappings.
    document = self.docx.document
    rels_by_id, rels_by_type = self.docx.document_relationships
    self.fields(document, self.log)
    self.read_styles(rels_by_type)
    self.images(rels_by_id)

    # Fresh per-conversion bookkeeping.
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    body_paras = []

    self.log.debug('Converting Word markup to HTML')

    self.read_page_properties(document)
    for block, props in self.page_map.iteritems():
        self.current_page = props
        if not block.tag.endswith('}p'):
            continue
        html_para = self.convert_p(block)
        self.body.append(html_para)
        body_paras.append(block)
    self.read_block_anchors(document)
    self.styles.apply_contextual_spacing(body_paras)
    # The first section starts the file, so it is excluded from the
    # sections that get a page break at their start.
    self.styles.apply_section_page_breaks(self.section_starts[1:])

    notes_header = None
    if self.footnotes.has_notes:
        notes_list = DL()
        notes_list.set('class', 'notes')
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        self.body.append(notes_list)
        for anchor, text, note in self.footnotes:
            # <dt id=anchor>[<a href="#back_anchor">...</a>]</dt>
            back_link = A('←' + text, href='#back_%s' % anchor, title=text)
            notes_list.append(DT('[', back_link, id=anchor))
            notes_list[-1][0].tail = ']'
            notes_list.append(DD())
            note_paras = []
            for block in note:
                if block.tag.endswith('}tbl'):
                    # Tables are registered and recorded against the
                    # current page instead of being converted here.
                    self.tables.register(block, self.styles)
                    self.page_map[block] = self.current_page
                    continue
                html_para = self.convert_p(block)
                notes_list[-1].append(html_para)  # the <dd> added above
                note_paras.append(block)
            self.styles.apply_contextual_spacing(note_paras)

    self.resolve_links(rels_by_id)

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    # Gather (html element, numbering id, level) for numbered objects.
    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get('calibre_num_id', None)
        if raw is None:
            continue
        level, _sep, num_id = raw.partition(':')
        try:
            level = int(level)
        except (TypeError, ValueError):
            level = 0
        numbered.append((html_obj, num_id, level))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()

    # One body child per tab-indented line.
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for node in self.body:
            node.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is None:
            continue
        css = style.css
        if not css:
            continue
        class_name = self.styles.class_name(css)
        if class_name:
            html_obj.set('class', class_name)
    for html_obj, css in self.framed_map.iteritems():
        class_name = self.styles.class_name(css)
        if class_name:
            html_obj.set('class', class_name)

    if notes_header is not None:
        # Only the first h1/h2/h3 child yielded is consulted.
        first_heading = next(iter(children(self.body, 'h1', 'h2', 'h3')), None)
        if first_heading is not None:
            notes_header.tag = first_heading.tag
            heading_class = first_heading.get('class', None)
            if heading_class and heading_class != 'notes-header':
                notes_header.set('class', '%s notes-header' % heading_class)

    self.log.debug('Cleaning up redundant markup generated by Word')
    # cleanup_markup's return value is stored as the cover image.
    self.cover_image = cleanup_markup(
        self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

    return self.write(document)