def read_page_properties(self, doc):
    """Populate ``self.page_map`` with the page properties of each paragraph.

    The ``w:p`` and ``w:tbl`` descendants of *doc* are visited in the order
    ``descendants()`` yields them. Paragraphs are collected until one that
    contains ``w:sectPr`` elements is reached; that paragraph and every
    paragraph collected before it are then mapped to a ``PageProperties``
    built from those ``w:sectPr`` elements. Paragraphs still pending when the
    walk ends are mapped to a ``PageProperties`` built from the result of
    ``./w:body/w:sectPr``.

    As a side effect every ``w:tbl`` is registered with ``self.tables`` and,
    while a table is being tracked, each paragraph with a ``w:tbl`` ancestor
    is added to ``self.tables``.
    """
    self.page_map = OrderedDict()
    pending = []
    inside_table = False

    for node in descendants(doc, 'w:p', 'w:tbl'):
        if node.tag.endswith('}tbl'):
            inside_table = True
            self.tables.register(node)
            continue

        section = tuple(descendants(node, 'w:sectPr'))
        if not section:
            pending.append(node)
        else:
            # This paragraph closes a section: it and everything collected
            # so far share the same page properties.
            props = PageProperties(section)
            pending.append(node)
            for para in pending:
                self.page_map[para] = props
            pending = []

        if inside_table:
            if ancestor(node, 'w:tbl') is None:
                # First paragraph outside any table: stop tracking
                inside_table = False
            else:
                self.tables.add(node)

    if pending:
        # Trailing paragraphs use the section properties found on the body
        body_section = XPath('./w:body/w:sectPr')(doc)
        props = PageProperties(body_section)
        for para in pending:
            self.page_map[para] = props
def read_page_properties(self, doc):
    """Fill ``self.page_map`` so each paragraph maps to its page properties.

    Iterates over the ``w:p`` and ``w:tbl`` descendants of *doc*. Paragraphs
    accumulate until one containing ``w:sectPr`` elements shows up; it and
    all accumulated paragraphs are then assigned a ``PageProperties`` created
    from those elements. Whatever is still accumulated after the loop gets a
    ``PageProperties`` created from the ``./w:body/w:sectPr`` lookup.

    Tables met along the way are registered with ``self.tables``; while a
    table is being tracked, paragraphs that have a ``w:tbl`` ancestor are
    added to ``self.tables``.
    """
    self.page_map = OrderedDict()
    unassigned = []
    tracking_table = False

    for elem in descendants(doc, "w:p", "w:tbl"):
        if elem.tag.endswith("}tbl"):
            tracking_table = True
            self.tables.register(elem)
            continue

        sect_props = tuple(descendants(elem, "w:sectPr"))
        if not sect_props:
            unassigned.append(elem)
        else:
            # Section boundary: flush every collected paragraph, plus this one
            page_props = PageProperties(sect_props)
            unassigned.append(elem)
            for para in unassigned:
                self.page_map[para] = page_props
            unassigned = []

        if tracking_table:
            if ancestor(elem, "w:tbl") is None:
                # We have left the table
                tracking_table = False
            else:
                self.tables.add(elem)

    if unassigned:
        # Remaining paragraphs fall under the body-level section properties
        final_sect = XPath("./w:body/w:sectPr")(doc)
        page_props = PageProperties(final_sect)
        for para in unassigned:
            self.page_map[para] = page_props
def from_toc(docx, link_map, styles, object_map):
    """Build a Table of Contents from the hyperlinks inside a Word TOC field.

    Scans the ``w:fldChar``, ``w:instrText`` and ``w:hyperlink`` elements of
    *docx*, tracking field nesting via ``begin``/``end`` field characters. A
    ``w:instrText`` inside a field whose stripped text starts with ``'TOC '``
    marks the TOC field; scanning stops once the nesting depth drops below
    that field's depth. Every hyperlink seen at or below the TOC depth that is
    a key of *link_map* and yields both text and an href becomes an entry.

    Returns ``structure_toc(entries)`` when at least one entry was found,
    otherwise ``None``.
    """
    TI = namedtuple('TI', 'text anchor indent')
    entries = []
    depth = 0
    toc_depth = None
    find_nodes = XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')

    for node in find_nodes(docx):
        kind = node.tag.rpartition('}')[-1]

        if kind == 'fldChar':
            char_type = get(node, 'w:fldCharType')
            if char_type == 'begin':
                depth += 1
            elif char_type == 'end':
                depth -= 1
                if toc_depth is not None and depth < toc_depth:
                    # The TOC field has been closed
                    break
            continue

        if kind == 'instrText':
            if depth > 0 and node.text and node.text.strip().startswith('TOC '):
                toc_depth = depth
            continue

        if kind != 'hyperlink':
            continue
        if toc_depth is None or depth < toc_depth or node not in link_map:
            continue

        a = link_map[node]
        href = a.get('href', None)
        text = link_to_txt(a, styles, object_map)
        para = ancestor(node, 'w:p')
        if not (text and href) or para is None:
            continue

        para_style = styles.resolve_paragraph(para)
        try:
            # presumably drops a two character unit suffix (e.g. "pt") -- TODO confirm
            indent = int(para_style.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            indent = 0
        if para_style.text_align in {'center', 'right'}:
            # The left margin is ignored for centered/right aligned entries
            indent = 0
        # href[1:] skips the first character of the href
        entries.append(TI(text, href[1:], indent))

    if not entries:
        return None
    return structure_toc(entries)
def from_toc(docx, link_map, styles, object_map, log):
    """Build a Table of Contents from the hyperlinks inside a Word TOC field.

    Scans the ``w:fldChar``, ``w:instrText`` and ``w:hyperlink`` elements of
    *docx*, tracking field nesting via ``begin``/``end`` field characters. A
    ``w:instrText`` inside a field whose stripped text starts with ``'TOC '``
    marks the TOC field; scanning stops once the nesting depth drops below
    that field's depth. Every hyperlink seen at or below the TOC depth that is
    a key of *link_map* and yields both text and an href becomes an entry.

    When at least one entry was found, a message is emitted by calling *log*
    and ``structure_toc(entries)`` is returned; otherwise ``None`` is
    returned and nothing is logged.
    """
    TI = namedtuple('TI', 'text anchor indent')
    entries = []
    depth = 0
    toc_depth = None
    find_nodes = XPath(
        '//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]'
    )

    for node in find_nodes(docx):
        kind = node.tag.rpartition('}')[-1]

        if kind == 'fldChar':
            char_type = get(node, 'w:fldCharType')
            if char_type == 'begin':
                depth += 1
            elif char_type == 'end':
                depth -= 1
                if toc_depth is not None and depth < toc_depth:
                    # The TOC field has been closed
                    break
            continue

        if kind == 'instrText':
            if depth > 0 and node.text and node.text.strip().startswith('TOC '):
                toc_depth = depth
            continue

        if kind != 'hyperlink':
            continue
        if toc_depth is None or depth < toc_depth or node not in link_map:
            continue

        a = link_map[node]
        href = a.get('href', None)
        text = link_to_txt(a, styles, object_map)
        para = ancestor(node, 'w:p')
        if not (text and href) or para is None:
            continue

        para_style = styles.resolve_paragraph(para)
        try:
            # presumably drops a two character unit suffix (e.g. "pt") -- TODO confirm
            indent = int(para_style.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            indent = 0
        if para_style.text_align in {'center', 'right'}:
            # The left margin is ignored for centered/right aligned entries
            indent = 0
        # href[1:] skips the first character of the href
        entries.append(TI(text, href[1:], indent))

    if not entries:
        return None
    log('Found Word Table of Contents, using it to generate the Table of Contents')
    return structure_toc(entries)
def __call__(self):
    """Convert the document to HTML and return the result of ``self.write()``.

    The steps, in order:

    1. Read styles and images, then reset the per-conversion state
       (``layers``, ``framed``, ``framed_map``, ``anchor_map``, ``link_map``).
    2. Convert every paragraph in ``self.page_map`` and append it to
       ``self.body``.
    3. If there are footnotes, append a header and a ``<dl class="notes">``
       holding one ``<dt>``/``<dd>`` pair per note.
    4. Resolve links, cascade styles, apply table, numbering and frame markup.
    5. Put newline/tab whitespace between the children of the body.
    6. Generate CSS classes and set them on the converted and framed objects.
    7. Give the notes header the tag (and class) of the first ``h1``/``h2``/
       ``h3`` child of the body.
    """
    document = self.docx.document
    rels_by_id, rels_by_type = self.docx.document_relationships
    self.read_styles(rels_by_type)
    self.images(rels_by_id)

    # Fresh bookkeeping for this conversion
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)

    self.read_page_properties(document)
    for para, page_props in self.page_map.iteritems():
        self.current_page = page_props
        converted = self.convert_p(para)
        self.body.append(converted)

    notes_header = None
    if self.footnotes.has_notes:
        notes = DL()
        notes.set("class", "notes")
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set("class", "notes-header")
        self.body.append(notes)
        for anchor, text, note in self.footnotes:
            back_link = A("←" + text, href="#back_%s" % anchor, title=text)
            notes.append(DT("[", back_link, id=anchor))
            # Close the bracket after the back link
            notes[-1][0].tail = "]"
            notes.append(DD())
            inside_table = False
            for child in note:
                if child.tag.endswith("}tbl"):
                    self.tables.register(child)
                    inside_table = True
                    continue
                if inside_table:
                    if ancestor(child, "w:tbl") is None:
                        inside_table = False
                    else:
                        self.tables.add(child)
                converted = self.convert_p(child)
                notes[-1].append(converted)

    self.resolve_links(rels_by_id)
    self.styles.cascade(self.layers)
    self.tables.apply_markup(self.object_map)

    # Collect (html element, numbering id, level) for every numbered object
    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get("calibre_num_id", None)
        if raw is None:
            continue
        # raw has the form "<level>:<num_id>"
        level, num_id = raw.partition(":")[0::2]
        try:
            level = int(level)
        except (TypeError, ValueError):
            level = 0
        numbered.append((html_obj, num_id, level))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
    self.apply_frames()

    if len(self.body) > 0:
        # Whitespace between the children of the body
        self.body.text = "\n\t"
        for child in self.body:
            child.tail = "\n\t"
        self.body[-1].tail = "\n"

    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is None:
            continue
        css = style.css
        if not css:
            continue
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set("class", cls)
    for html_obj, css in self.framed_map.iteritems():
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set("class", cls)

    if notes_header is not None:
        # Only the first matching heading is looked at
        for heading in self.body.iterchildren("h1", "h2", "h3"):
            notes_header.tag = heading.tag
            cls = heading.get("class", None)
            if cls and cls != "notes-header":
                notes_header.set("class", "%s notes-header" % cls)
            break

    return self.write()
def convert_p(self, p):
    """Convert the paragraph *p* into an HTML block element and return it.

    The new element is recorded in ``self.object_map`` and framed via
    ``self.add_frame``. Each ``w:r`` descendant is converted with
    ``self.convert_run``, appended to the block and recorded in
    ``self.layers[p]``. A ``w:bookmarkStart`` whose name is not yet in
    ``self.anchor_map`` yields an id that is set on the block (if it has no
    children yet) or on the next converted run. Runs that have a
    ``w:hyperlink`` ancestor are collected in ``self.link_map``.

    The block becomes ``h1``-``h6`` when the style name matches
    ``heading <n>`` and gets ``dir="rtl"`` for right-to-left paragraphs.
    Finally, groups of adjacent spans whose run styles have the same border
    are wrapped in a single ``<span>`` carrying the border CSS.
    """
    block = P()
    self.object_map[block] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(block, para_style.frame)

    pending_anchor = None
    open_hyperlink = None

    for node in descendants(p, "w:r", "w:bookmarkStart", "w:hyperlink"):
        tag = node.tag
        if tag.endswith("}r"):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself if this is its first run
                target = span if len(block) else block
                target.set("id", pending_anchor)
                pending_anchor = None
            if open_hyperlink is not None:
                link = ancestor(node, "w:hyperlink")
                if link is None:
                    # We have left the hyperlink
                    open_hyperlink = None
                else:
                    self.link_map[link].append(span)
            block.append(span)
            self.layers[p].append(node)
        elif tag.endswith("}bookmarkStart"):
            name = get(node, "w:name")
            if name and name not in self.anchor_map:
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues())
                )
                self.anchor_map[name] = pending_anchor
        elif tag.endswith("}hyperlink"):
            open_hyperlink = node

    heading = re.match(r"heading\s+(\d+)$", para_style.style_name or "", re.IGNORECASE)
    if heading is not None:
        # Clamp the heading level to the range 1-6
        level = min(6, max(1, int(heading.group(1))))
        block.tag = "h%d" % level

    if para_style.direction == "rtl":
        block.set("dir", "rtl")

    # Group adjacent spans whose run styles report the same border.
    # NOTE(review): a group that is still open when the loop ends is never
    # added to shared_borders, and the span that ends a group does not start
    # the next one -- confirm this is intended.
    open_group = []
    shared_borders = []
    for span in block:
        run_style = self.styles.resolve_run(self.object_map[span])
        if not open_group or open_group[-1][1].same_border(run_style):
            open_group.append((span, run_style))
        else:
            if len(open_group) > 1:
                shared_borders.append(open_group)
            open_group = []

    for group in shared_borders:
        members = []
        border_css = {}
        for span, run_style in group:
            # Move the border CSS from the individual runs to the wrapper
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            members.append(span)
        if border_css:
            cls = self.styles.register(border_css, "text_border")
            wrapper = self.wrap_elems(members, SPAN())
            wrapper.set("class", cls)

    return block
def __call__(self):
    """Convert the document to HTML and return the result of ``self.write()``.

    The steps, in order:

    1. Read styles and images, then reset the per-conversion state
       (``layers``, ``framed``, ``framed_map``, ``anchor_map``, ``link_map``).
    2. Convert every paragraph in ``self.page_map`` and append it to
       ``self.body``.
    3. If there are footnotes, append a header and a ``<dl class="notes">``
       holding one ``<dt>``/``<dd>`` pair per note.
    4. Resolve links, cascade styles, apply table, numbering and frame markup.
    5. Put newline/tab whitespace between the children of the body.
    6. Generate CSS classes and set them on the converted and framed objects.
    7. Give the notes header the tag (and class) of the first ``h1``/``h2``/
       ``h3`` child of the body.
    """
    document = self.docx.document
    rels_by_id, rels_by_type = self.docx.document_relationships
    self.read_styles(rels_by_type)
    self.images(rels_by_id)

    # Fresh bookkeeping for this conversion
    self.layers = OrderedDict()
    self.framed = [[]]
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)

    self.read_page_properties(document)
    for para, page_props in self.page_map.iteritems():
        self.current_page = page_props
        converted = self.convert_p(para)
        self.body.append(converted)

    notes_header = None
    if self.footnotes.has_notes:
        notes = DL()
        notes.set('class', 'notes')
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        self.body.append(notes)
        for anchor, text, note in self.footnotes:
            back_link = A('←' + text, href='#back_%s' % anchor, title=text)
            notes.append(DT('[', back_link, id=anchor))
            # Close the bracket after the back link
            notes[-1][0].tail = ']'
            notes.append(DD())
            inside_table = False
            for child in note:
                if child.tag.endswith('}tbl'):
                    self.tables.register(child)
                    inside_table = True
                    continue
                if inside_table:
                    if ancestor(child, 'w:tbl') is None:
                        inside_table = False
                    else:
                        self.tables.add(child)
                converted = self.convert_p(child)
                notes[-1].append(converted)

    self.resolve_links(rels_by_id)
    self.styles.cascade(self.layers)
    self.tables.apply_markup(self.object_map)

    # Collect (html element, numbering id, level) for every numbered object
    numbered = []
    for html_obj, obj in self.object_map.iteritems():
        raw = obj.get('calibre_num_id', None)
        if raw is None:
            continue
        # raw has the form "<level>:<num_id>"
        level, num_id = raw.partition(':')[0::2]
        try:
            level = int(level)
        except (TypeError, ValueError):
            level = 0
        numbered.append((html_obj, num_id, level))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
    self.apply_frames()

    if len(self.body) > 0:
        # Whitespace between the children of the body
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.styles.generate_classes()
    for html_obj, obj in self.object_map.iteritems():
        style = self.styles.resolve(obj)
        if style is None:
            continue
        css = style.css
        if not css:
            continue
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)
    for html_obj, css in self.framed_map.iteritems():
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Only the first matching heading is looked at
        for heading in self.body.iterchildren('h1', 'h2', 'h3'):
            notes_header.tag = heading.tag
            cls = heading.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break

    return self.write()
def convert_p(self, p):
    """Convert the paragraph *p* into an HTML block element and return it.

    The new element is recorded in ``self.object_map`` and framed via
    ``self.add_frame``. Each ``w:r`` descendant is converted with
    ``self.convert_run``, appended to the block and recorded in
    ``self.layers[p]``. A ``w:bookmarkStart`` whose name is not yet in
    ``self.anchor_map`` yields an id that is set on the block (if it has no
    children yet) or on the next converted run. Runs that have a
    ``w:hyperlink`` ancestor are collected in ``self.link_map``.

    The block becomes ``h1``-``h6`` when the style name matches
    ``heading <n>`` and gets ``dir="rtl"`` for right-to-left paragraphs.
    Finally, groups of adjacent spans whose run styles have the same border
    are wrapped in a single ``<span>`` carrying the border CSS.
    """
    block = P()
    self.object_map[block] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(block, para_style.frame)

    pending_anchor = None
    open_hyperlink = None

    for node in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        tag = node.tag
        if tag.endswith('}r'):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself if this is its first run
                target = span if len(block) else block
                target.set('id', pending_anchor)
                pending_anchor = None
            if open_hyperlink is not None:
                link = ancestor(node, 'w:hyperlink')
                if link is None:
                    # We have left the hyperlink
                    open_hyperlink = None
                else:
                    self.link_map[link].append(span)
            block.append(span)
            self.layers[p].append(node)
        elif tag.endswith('}bookmarkStart'):
            name = get(node, 'w:name')
            if name and name not in self.anchor_map:
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues()))
                self.anchor_map[name] = pending_anchor
        elif tag.endswith('}hyperlink'):
            open_hyperlink = node

    heading = re.match(r'heading\s+(\d+)$', para_style.style_name or '', re.IGNORECASE)
    if heading is not None:
        # Clamp the heading level to the range 1-6
        level = min(6, max(1, int(heading.group(1))))
        block.tag = 'h%d' % level

    if para_style.direction == 'rtl':
        block.set('dir', 'rtl')

    # Group adjacent spans whose run styles report the same border.
    # NOTE(review): a group that is still open when the loop ends is never
    # added to shared_borders, and the span that ends a group does not start
    # the next one -- confirm this is intended.
    open_group = []
    shared_borders = []
    for span in block:
        run_style = self.styles.resolve_run(self.object_map[span])
        if not open_group or open_group[-1][1].same_border(run_style):
            open_group.append((span, run_style))
        else:
            if len(open_group) > 1:
                shared_borders.append(open_group)
            open_group = []

    for group in shared_borders:
        members = []
        border_css = {}
        for span, run_style in group:
            # Move the border CSS from the individual runs to the wrapper
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            members.append(span)
        if border_css:
            cls = self.styles.register(border_css, 'text_border')
            wrapper = self.wrap_elems(members, SPAN())
            wrapper.set('class', cls)

    return block
def cleanup_markup(log, root, styles, dest_dir, detect_cover):
    """Tidy the generated HTML tree in place and optionally detect a cover.

    The following passes are run over *root*, in order:

    1. ``<hr>`` elements inside a ``<span>`` that are the last descendant of
       their enclosing ``<p>`` are moved to just after that ``<p>``.
    2. Runs of consecutive mergeable ``<span>`` elements are merged.
    3. A ``<span>`` that is the sole content of a block element (``p``,
       ``div``, ``h1``-``h6``) and whose CSS is liftable is removed, its
       class, text and children being moved to the block.
    4. Spans whose only styling is italic or bold become ``<i>``/``<b>``.
    5. Spans with neither a class nor an id are lifted.

    :param log: Logger, only ``log.debug()`` is used.
    :param root: Root of the HTML tree, must support ``xpath()``.
    :param styles: Object whose ``classes.itervalues()`` yields
        ``(class name, css dict)`` pairs.
    :param dest_dir: Directory against which the ``src`` of images is
        resolved.
    :param detect_cover: If true, check whether the first image looks like a
        cover.
    :return: The path to the detected cover image (the ``<img>`` is removed
        from the tree), otherwise ``None``.
    """
    # Move <hr>s outside paragraphs, if possible.
    for hr in root.xpath('//span/hr'):
        p = ancestor(hr, 'p')
        if p is None:
            # Not inside a paragraph, nothing to move it out of
            continue
        if tuple(p.iterdescendants())[-1] is hr:
            parent = p.getparent()
            idx = parent.index(p)
            parent.insert(idx+1, hr)
            hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if not current_run or mergeable(current_run[-1], span):
            current_run.append(span)
            continue
        if len(current_run) > 1:
            merge_run(current_run)
        current_run = [span]
    # The last run has no following span to trigger the merge, so flush it
    # explicitly
    if len(current_run) > 1:
        merge_run(current_run)

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    class_map = dict(styles.classes.itervalues())
    parents = ('p', 'div') + tuple('h%d' % i for i in xrange(1, 7))
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            if liftable(span_css):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class) and not(@id)]'):
        lift(span)

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.magick.draw import identify
                try:
                    width, height, fmt = identify(path)
                except Exception:
                    # Best effort: an image that cannot be identified is
                    # simply not a cover
                    width, height, fmt = 0, 0, None
                try:
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    # Zero width (e.g. identify() failed above)
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path