Ejemplo n.º 1
0
def clean_stars(div: HtmlElement) -> None:
    """Tidy a chapter of "Unspeakable Desolation Pouring Down From the Stars".

    Works on ``div`` in place:

    * drops every ``p`` that matches ``p[strong[em]]`` and every ``hr``;
    * inserts the story title as an ``h1`` at the top;
    * turns the first remaining paragraph into a title-cased ``h2``;
    * rebuilds the paragraph holding a bold link as a ``breakabove``
      paragraph whose whole text is a single ``internal`` link.
    """
    # Same two removal passes as before, in the same order.
    for xpath in ("p[strong[em]]", ".//hr"):
        for unwanted in elements(div, xpath):
            unwanted.drop_tree()

    div.insert(0, H1("Unspeakable Desolation Pouring Down From the Stars"))

    # The first paragraph becomes the chapter heading.
    heading_source = element(div, "./p[1]")
    replace(heading_source, H2(heading_source.text_content().title()))

    # The bold-link paragraph keeps its text and link target, nothing else.
    link_paragraph = element(div, "./p[strong[a]]")
    anchor = element(div, "./p/strong/a")
    link = A(
        link_paragraph.text_content(),
        CLASS("internal"),
        href=anchor.attrib["href"],
    )
    replace(link_paragraph, P(CLASS("breakabove"), link))
Ejemplo n.º 2
0
def main():
    """Clean and write every section of the big book, then rebuild its TOC.

    Sections are handled in a fixed order: posts, quotes, monthly preambles
    and miscellany. Each item is cleaned and passed to ``write``; a
    miscellany item that already has a hand-edited ``.xhtml`` counterpart in
    the old book is taken from that file instead of being cleaned again.
    """
    # Posts: unwrap the link inside the h2 and promote the heading to h1.
    for post_div, post_data in posts():
        heading = element(post_div, "h2")
        element(heading, "a").drop_tag()
        heading.tag = "h1"
        clean(post_div, post_data)
        write(post_div, post_data)

    # Quotes: cleaned first, then given a fixed heading.
    for quote_div, quote_data in quotes():
        clean(quote_div, quote_data)
        quote_div.insert(0, H1("Quote of the Day"))
        write(quote_div, quote_data)

    # Preambles: wipe all attributes, tag the div, drop its direct link.
    for preamble_div, preamble_data in preambles():
        attributes = preamble_div.attrib
        attributes.clear()
        attributes["class"] = "monthly-preamble"
        element(preamble_div, "./a").drop_tree()
        clean(preamble_div, preamble_data)
        write(preamble_div, preamble_data)

    # Miscellany: prefer an existing edited version over re-cleaning.
    for misc_div, misc_data in miscellany():
        xhtml_name = misc_data.href.removesuffix(".htm") + ".xhtml"
        edited_version = OldBook / "Text" / xhtml_name
        if not edited_version.exists():
            clean_misc_text(misc_div)
            clean(misc_div, misc_data)
            is_stars_chapter = (
                "unspeakable-desolation-pouring-down-from-the-stars-chapter"
                in misc_data.name
            )
            if is_stars_chapter:
                clean_stars(misc_div)
        else:
            root: HtmlElement = parse(str(edited_version)).getroot()
            # The edited file's <body> stands in for the scraped div.
            misc_div = element(root, "body")
            misc_div.tag = "div"
            for image in elements(misc_div, ".//img"):
                image_name = Path(image.attrib["src"]).name
                image.attrib["src"] = copy_resource(
                    misc_data.date,
                    image_name,
                    OldBook / "Images",
                    BigBook / "Images",
                )
        write(misc_div, misc_data)

    update_bigbook_toc.run(BigBook / "Text")
Ejemplo n.º 3
0
def toc_as_html(toc, pdf, opts):
    """Render *toc* as a standalone HTML table-of-contents page.

    Builds an ``<html>`` tree with an inline stylesheet (the built-in rules
    followed by ``opts.extra_css`` when set), a translated
    "Table of Contents" heading and an empty table, hands the table to
    ``process_children`` together with *toc* and the underlying PDF object,
    and returns the tree serialized by ``tostring`` with ``encoding='utf-8'``.

    :param toc: table-of-contents object passed through to
        ``process_children``.
    :param pdf: object exposing ``engine.pdf``; only that attribute is used.
    :param opts: options object; only ``opts.extra_css`` is read.
    :return: the value returned by ``tostring`` for the finished document.
    """
    pdf = pdf.engine.pdf

    # One (level, padding) pair for each of the six nesting levels; the
    # flattened list fills the twelve placeholders in the stylesheet below.
    # ``range`` is used instead of ``xrange`` so this also runs on Python 3;
    # for six items the result is the same on Python 2.
    indents = []
    for level in range(1, 7):
        indents.extend((level, 1.4 * level))

    # NOTE(review): '%.1g' keeps a single significant digit, so the paddings
    # are emitted as 1em, 3em, 4em, 6em, 7em, 8em rather than 1.4em, 2.8em,
    # ... -- confirm whether '%.1f' was intended before changing it, since
    # that would alter the rendered layout.
    html = HTML(
        HEAD(
            STYLE('''
            .calibre-pdf-toc table { width: 100%% }

            .calibre-pdf-toc table tr td:last-of-type { text-align: right }

            .calibre-pdf-toc .level-0 {
                font-size: larger;
            }

            .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
            .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
            .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
            .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
            .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
            .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem }
            ''' % tuple(indents) + (opts.extra_css or ''))),
        BODY(
            H1(_('Table of Contents')),
            TABLE(),
        ))
    # html[0] is <head>, html[1] is <body>; body[1] is the (still empty)
    # table that follows the heading.
    body = html[1]
    body.set('class', 'calibre-pdf-toc')

    process_children(toc, body[1], 0, pdf)

    return tostring(html,
                    pretty_print=True,
                    include_meta_content_type=True,
                    encoding='utf-8')
Ejemplo n.º 4
0
    def __call__(self):
        """Convert the loaded DOCX document to HTML and write the result.

        The steps run in a fixed order: read fields, styles and images;
        convert every element in ``self.page_map`` whose tag ends in ``}p``
        and append it to ``self.body``; append footnotes (if any) under a
        notes heading; turn leading tab runs into ``text-indent``; resolve
        links; cascade styles; apply table, numbering and frame markup;
        assign CSS classes; and finally clean up the generated markup.

        Returns whatever ``self.write(doc)`` returns.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Reset the bookkeeping containers used during this conversion.
        self.layers = OrderedDict()
        self.framed = [[]]
        self.frame_map = {}
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        self.toc_anchor = None
        self.block_runs = []
        # Source elements converted so far; reused (re-bound) per footnote below.
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.resolve_alternate_content(doc)
        # Relationship ids in the main document resolve against the document's own map.
        self.current_rels = relationships_by_id
        # page_map pairs each source element with its page properties; only
        # elements whose tag ends in '}p' are converted here.
        for wp, page_properties in iteritems(self.page_map):
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        self.mark_block_runs(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        # Saved so it can be restored after the footnotes swap in their own map.
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            for anchor, text, note in self.footnotes:
                # Each note gets its own <dl id=anchor class="footnote">.
                dl = DL(id=anchor)
                dl.set('class', 'footnote')
                self.body.append(dl)
                # <dt> reads "[←text]" and links back to '#back_<anchor>'.
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
                # The tail of the <a> inside the <dt> supplies the closing bracket.
                dl[-1][0].tail = ']'
                dl.append(DD())
                # Start a fresh list so spacing/block runs are computed per note.
                paras = []
                # While converting this note, resolve relationship ids against
                # the first item of the note's own rels.
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        # Tables are registered rather than converted as
                        # paragraphs, and recorded in page_map with the page
                        # properties currently held in self.current_page.
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        # dl[-1] is the <dd> appended above.
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)
                self.mark_block_runs(paras)

        for p, wp in iteritems(self.object_map):
            # Match: p has children and no leading text, its first child has
            # children and no leading text, and that child's first child has
            # class 'tab'.
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                # Collect the leading run of 'tab' children; stop after the
                # first tab that is followed by text, or at the first
                # non-tab child.
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                # Only rewrite when text_indent is the inherit sentinel or a
                # string-like value ending in 'pt'.
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        # Add the tab indent to the existing value (strip 'pt').
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    # Carry over the text that followed the last tab, then
                    # remove the tab elements themselves.
                    parent.text = tabs[-1].tail or ''
                    list(map(parent.remove, tabs))

        # Footnote processing is done; go back to the document-level map.
        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        # Collect (html element, num_id, level) for every source object that
        # carries a 'calibre_num_id' attribute of the form '<lvl>:<num_id>'.
        numbered = []
        for html_obj, obj in iteritems(self.object_map):
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                # partition(':') -> (before, ':', after); [0::2] keeps before/after.
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    # A non-integer level falls back to 0.
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        # Whitespace-only formatting of <body>: newline + tab before each
        # child, and a bare newline after the last one.
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        # Give each HTML element the class name for its resolved style's CSS,
        # when there is both non-empty CSS and a class name for it.
        for html_obj, obj in iteritems(self.object_map):
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        # Framed elements carry their CSS directly in framed_map.
        for html_obj, css in iteritems(self.framed_map):
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            # Make the notes heading use the tag (and class, if any) of the
            # first h1/h2/h3 yielded for the body; only that first heading is
            # looked at. The 'notes-header' check presumably covers the case
            # where that heading is the notes header itself.
            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        # cleanup_markup's return value is kept as the cover image.
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

        return self.write(doc)
Ejemplo n.º 5
0
    def __call__(self):
        """Convert the loaded DOCX document to HTML and write the result.

        Reads styles and images, converts every entry of ``self.page_map``
        with ``convert_p`` and appends it to ``self.body``, appends all
        footnotes (if any) in a single ``<dl class="notes">`` under a notes
        heading, then resolves links, cascades styles, applies table,
        numbering and frame markup and assigns CSS classes.

        Returns whatever ``self.write()`` returns.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Reset the bookkeeping containers used during this conversion.
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        # page_map pairs each source element with its page properties; every
        # entry is converted (no tag filtering in this version).
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            # All notes share one <dl class="notes"> placed after the heading.
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                # <dt id=anchor> reads "[←text]" and links back to '#back_<anchor>'.
                dl.append(
                    DT('[',
                       A('←' + text, href='#back_%s' % anchor, title=text),
                       id=anchor))
                # The tail of the <a> inside the <dt> supplies the closing bracket.
                dl[-1][0].tail = ']'
                dl.append(DD())
                # True from the point a table element is seen until an element
                # without a w:tbl ancestor turns up.
                in_table = False
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        # The table element itself is registered, not converted.
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, 'w:tbl') is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    # Note: elements added to a table above still fall through
                    # and are converted and appended to the <dd> as well.
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        # Collect (html element, num_id, level) for every source object that
        # carries a 'calibre_num_id' attribute of the form '<lvl>:<num_id>'.
        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                # partition(':') -> (before, ':', after); [0::2] keeps before/after.
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    # A non-integer level falls back to 0.
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles,
                                    self.object_map)
        self.apply_frames()

        # Whitespace-only formatting of <body>: newline + tab before each
        # child, and a bare newline after the last one.
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.styles.generate_classes()
        # Give each HTML element the class name for its resolved style's CSS,
        # when there is both non-empty CSS and a class name for it.
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        # Framed elements carry their CSS directly in framed_map.
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            # Make the notes heading use the tag (and class, if any) of the
            # first h1/h2/h3 child of the body; only that first heading is
            # looked at. The 'notes-header' check presumably covers the case
            # where that heading is the notes header itself.
            for h in self.body.iterchildren('h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        return self.write()
Ejemplo n.º 6
0
    def __call__(self):
        """Convert the loaded DOCX document to HTML and write the result.

        Reads fields, styles and images, converts every element in
        ``self.page_map`` whose tag ends in ``}p`` and appends it to
        ``self.body``, appends all footnotes (if any) in a single
        ``<dl class="notes">`` under a notes heading, then resolves links,
        cascades styles, applies table, numbering and frame markup, assigns
        CSS classes and cleans up the generated markup.

        Returns whatever ``self.write(doc)`` returns.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Reset the bookkeeping containers used during this conversion.
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        # Source elements converted so far; reused (re-bound) per footnote below.
        paras = []

        self.log.debug('Converting Word markup to HTML')
        self.read_page_properties(doc)
        # page_map pairs each source element with its page properties; only
        # elements whose tag ends in '}p' are converted here.
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        if self.footnotes.has_notes:
            # All notes share one <dl class="notes"> placed after the heading.
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                # <dt id=anchor> reads "[←text]" and links back to '#back_<anchor>'.
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
                # The tail of the <a> inside the <dt> supplies the closing bracket.
                dl[-1][0].tail = ']'
                dl.append(DD())
                # Start a fresh list so contextual spacing is computed per note.
                paras = []
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        # Tables are registered rather than converted as
                        # paragraphs, and recorded in page_map with the page
                        # properties currently held in self.current_page.
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        # dl[-1] is the <dd> appended above.
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        # Collect (html element, num_id, level) for every source object that
        # carries a 'calibre_num_id' attribute of the form '<lvl>:<num_id>'.
        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                # partition(':') -> (before, ':', after); [0::2] keeps before/after.
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    # A non-integer level falls back to 0.
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        # Whitespace-only formatting of <body>: newline + tab before each
        # child, and a bare newline after the last one.
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        # Give each HTML element the class name for its resolved style's CSS,
        # when there is both non-empty CSS and a class name for it.
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        # Framed elements carry their CSS directly in framed_map.
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            # Make the notes heading use the tag (and class, if any) of the
            # first h1/h2/h3 yielded for the body; only that first heading is
            # looked at. The 'notes-header' check presumably covers the case
            # where that heading is the notes header itself.
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.log.debug('Cleaning up redundant markup generated by Word')
        # cleanup_markup's return value is kept as the cover image.
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

        return self.write(doc)