def __call__(self):
    """Convert the loaded DOCX document to HTML and write it out.

    The conversion runs in this order:

    1. Read fields, styles and images from the document.
    2. Convert every paragraph found in ``self.page_map`` and append it to
       ``self.body``.
    3. If there are footnotes/endnotes, append a notes header and a
       definition list holding the converted notes.
    4. Turn leading runs of ``tab`` elements into a ``text-indent`` style.
    5. Resolve links, cascade styles and apply table, numbering and frame
       markup.
    6. Assign CSS classes, polish field markup and clean up the HTML.

    Returns:
        Whatever ``self.write(doc)`` returns.
    """
    doc = self.docx.document
    relationships_by_id, relationships_by_type = self.docx.document_relationships
    self.fields(doc, self.log)
    self.read_styles(relationships_by_type)
    self.images(relationships_by_id)

    # Start every run with fresh bookkeeping containers
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    self.link_source_map = {}
    paras = []

    self.log.debug('Converting Word markup to HTML')

    self.read_page_properties(doc)
    self.current_rels = relationships_by_id
    for wp, page_properties in self.page_map.iteritems():
        self.current_page = page_properties
        if wp.tag.endswith('}p'):
            p = self.convert_p(wp)
            self.body.append(p)
            paras.append(wp)

    self.read_block_anchors(doc)
    self.styles.apply_contextual_spacing(paras)
    # Apply page breaks at the start of every section, except the first
    # section (since that will be the start of the file)
    self.styles.apply_section_page_breaks(self.section_starts[1:])

    notes_header = None
    # The notes loop below swaps in each note's own relationship map, so
    # remember the document-level one and restore it afterwards.
    orig_rid_map = self.images.rid_map
    if self.footnotes.has_notes:
        dl = DL()
        dl.set('class', 'notes')
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        self.body.append(dl)
        for anchor, text, note in self.footnotes:
            # <dt> holds the back-link to the note reference, wrapped in [ ]
            dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
            dl[-1][0].tail = ']'
            # <dd> receives the converted paragraphs of this note
            dl.append(DD())
            paras = []
            self.images.rid_map = self.current_rels = note.rels[0]
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    p = self.convert_p(wp)
                    dl[-1].append(p)
                    paras.append(wp)
            self.styles.apply_contextual_spacing(paras)

    for p, wp in self.object_map.iteritems():
        if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
            # Paragraph uses tabs for indentation, convert to text-indent
            parent = p[0]
            # Collect the leading run of tab children, stopping at the
            # first non-tab child or the first tab followed by text.
            tabs = []
            for child in parent:
                if child.get('class', None) == 'tab':
                    tabs.append(child)
                    if child.tail:
                        break
                else:
                    break
            indent = len(tabs) * self.settings.default_tab_stop
            style = self.styles.resolve(wp)
            # Only rewrite the indent when it is unset (inherit) or given
            # in points; other values are left untouched.
            if style.text_indent is inherit or (
                    hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                if style.text_indent is not inherit:
                    indent = float(style.text_indent[:-2]) + indent
                style.text_indent = '%.3gpt' % indent
                # Keep the text that followed the last tab before the
                # tab elements are removed.
                parent.text = tabs[-1].tail or ''
                # Explicit loop instead of map(): map() is lazy on
                # Python 3, where the removals would never execute.
                for tab in tabs:
                    parent.remove(tab)

    self.images.rid_map = orig_rid_map

    self.resolve_links()

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get('calibre_num_id', None)
        if raw is not None:
            # raw is split on the first ':' into (lvl, num_id); the [0::2]
            # slice drops the separator returned by partition().
            lvl, num_id = raw.partition(':')[0::2]
            try:
                lvl = int(lvl)
            except (TypeError, ValueError):
                lvl = 0
            numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()

    # Put every top-level child of the body on its own indented line
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is not None:
            css = style.css
            if css:
                cls = self.styles.class_name(css)
                if cls:
                    html_obj.set('class', cls)
    for html_obj, css in self.framed_map.iteritems():
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Give the notes header the tag and class of the first heading
        # yielded by children(); only that first heading is looked at.
        for h in children(self.body, 'h1', 'h2', 'h3'):
            notes_header.tag = h.tag
            cls = h.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break

    self.fields.polish_markup(self.object_map)

    self.log.debug('Cleaning up redundant markup generated by Word')
    self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

    return self.write(doc)
def __call__(self):
    """Convert the loaded DOCX document to HTML and write it out.

    The conversion runs in this order:

    1. Read styles and images from the document.
    2. Convert every paragraph found in ``self.page_map`` and append it to
       ``self.body``.
    3. If there are footnotes/endnotes, append a notes header and a
       definition list holding the converted notes.
    4. Resolve links, cascade styles and apply table, numbering and frame
       markup.
    5. Assign CSS classes to the generated elements.

    Returns:
        Whatever ``self.write()`` returns.
    """
    doc = self.docx.document
    relationships_by_id, relationships_by_type = self.docx.document_relationships
    self.read_styles(relationships_by_type)
    self.images(relationships_by_id)

    # Start every run with fresh bookkeeping containers
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)

    self.read_page_properties(doc)
    for wp, page_properties in self.page_map.iteritems():
        self.current_page = page_properties
        if wp.tag.endswith('}p'):
            p = self.convert_p(wp)
            self.body.append(p)

    notes_header = None
    if self.footnotes.has_notes:
        dl = DL()
        dl.set('class', 'notes')
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        self.body.append(dl)
        for anchor, text, note in self.footnotes:
            # <dt> holds the back-link to the note reference, wrapped in [ ]
            dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
            dl[-1][0].tail = ']'
            # <dd> receives the converted paragraphs of this note
            dl.append(DD())
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    # Only non-table elements go through convert_p; a
                    # table is handled by self.tables and must not also
                    # be emitted as a paragraph inside the note.
                    p = self.convert_p(wp)
                    dl[-1].append(p)

    self.resolve_links(relationships_by_id)

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get('calibre_num_id', None)
        if raw is not None:
            # raw is split on the first ':' into (lvl, num_id); the [0::2]
            # slice drops the separator returned by partition().
            lvl, num_id = raw.partition(':')[0::2]
            try:
                lvl = int(lvl)
            except (TypeError, ValueError):
                lvl = 0
            numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
    self.apply_frames()

    # Put every top-level child of the body on its own indented line
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is not None:
            css = style.css
            if css:
                cls = self.styles.class_name(css)
                if cls:
                    html_obj.set('class', cls)
    for html_obj, css in self.framed_map.iteritems():
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Give the notes header the tag and class of the first heading
        # yielded by children(); only that first heading is looked at.
        for h in children(self.body, 'h1', 'h2', 'h3'):
            notes_header.tag = h.tag
            cls = h.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break

    return self.write()
def __call__(self):
    """Convert the loaded DOCX document to HTML and write it out.

    The conversion runs in this order:

    1. Read fields, styles and images from the document.
    2. Convert every paragraph found in ``self.page_map`` and append it to
       ``self.body``.
    3. If there are footnotes/endnotes, append a notes header and a
       definition list holding the converted notes.
    4. Turn leading runs of ``tab`` elements into a ``text-indent`` style.
    5. Resolve links, cascade styles and apply table, numbering and frame
       markup.
    6. Assign CSS classes, polish field markup and clean up the HTML.

    Returns:
        Whatever ``self.write(doc)`` returns.
    """
    doc = self.docx.document
    relationships_by_id, relationships_by_type = self.docx.document_relationships
    self.fields(doc, self.log)
    self.read_styles(relationships_by_type)
    self.images(relationships_by_id)

    # Start every run with fresh bookkeeping containers
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    self.link_source_map = {}
    paras = []

    self.log.debug('Converting Word markup to HTML')

    self.read_page_properties(doc)
    self.current_rels = relationships_by_id
    for wp, page_properties in self.page_map.iteritems():
        self.current_page = page_properties
        if wp.tag.endswith('}p'):
            p = self.convert_p(wp)
            self.body.append(p)
            paras.append(wp)

    self.read_block_anchors(doc)
    self.styles.apply_contextual_spacing(paras)
    # Apply page breaks at the start of every section, except the first
    # section (since that will be the start of the file)
    self.styles.apply_section_page_breaks(self.section_starts[1:])

    notes_header = None
    # The notes loop below swaps in each note's own relationship map, so
    # remember the document-level one and restore it afterwards.
    orig_rid_map = self.images.rid_map
    if self.footnotes.has_notes:
        dl = DL()
        dl.set('class', 'notes')
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        self.body.append(dl)
        for anchor, text, note in self.footnotes:
            # <dt> holds the back-link to the note reference, wrapped in [ ]
            dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
            dl[-1][0].tail = ']'
            # <dd> receives the converted paragraphs of this note
            dl.append(DD())
            paras = []
            self.images.rid_map = self.current_rels = note.rels[0]
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    p = self.convert_p(wp)
                    dl[-1].append(p)
                    paras.append(wp)
            self.styles.apply_contextual_spacing(paras)

    for p, wp in self.object_map.iteritems():
        if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
            # Paragraph uses tabs for indentation, convert to text-indent
            parent = p[0]
            # Collect the leading run of tab children, stopping at the
            # first non-tab child or the first tab followed by text.
            tabs = []
            for child in parent:
                if child.get('class', None) == 'tab':
                    tabs.append(child)
                    if child.tail:
                        break
                else:
                    break
            indent = len(tabs) * self.settings.default_tab_stop
            style = self.styles.resolve(wp)
            # Only rewrite the indent when it is unset (inherit) or given
            # in points; other values are left untouched.
            if style.text_indent is inherit or (
                    hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                if style.text_indent is not inherit:
                    indent = float(style.text_indent[:-2]) + indent
                style.text_indent = '%.3gpt' % indent
                # Keep the text that followed the last tab before the
                # tab elements are removed.
                parent.text = tabs[-1].tail or ''
                # Explicit loop instead of map(): map() is lazy on
                # Python 3, where the removals would never execute.
                for tab in tabs:
                    parent.remove(tab)

    self.images.rid_map = orig_rid_map

    self.resolve_links()

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get('calibre_num_id', None)
        if raw is not None:
            # raw is split on the first ':' into (lvl, num_id); the [0::2]
            # slice drops the separator returned by partition().
            lvl, num_id = raw.partition(':')[0::2]
            try:
                lvl = int(lvl)
            except (TypeError, ValueError):
                lvl = 0
            numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()

    # Put every top-level child of the body on its own indented line
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is not None:
            css = style.css
            if css:
                cls = self.styles.class_name(css)
                if cls:
                    html_obj.set('class', cls)
    for html_obj, css in self.framed_map.iteritems():
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Give the notes header the tag and class of the first heading
        # yielded by children(); only that first heading is looked at.
        for h in children(self.body, 'h1', 'h2', 'h3'):
            notes_header.tag = h.tag
            cls = h.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break

    self.fields.polish_markup(self.object_map)

    self.log.debug('Cleaning up redundant markup generated by Word')
    self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

    return self.write(doc)