def read_page_properties(self, doc):
    """Populate ``self.page_map`` with the page properties of each paragraph.

    Walks the ``w:p`` and ``w:tbl`` descendants of *doc*.  Paragraphs are
    collected until one containing ``w:sectPr`` is found; that paragraph and
    everything collected before it are mapped to a ``PageProperties`` built
    from those ``w:sectPr`` elements.  Paragraphs left over at the end use
    the ``w:sectPr`` found directly under ``w:body``.

    Tables are registered with ``self.tables`` as they are met, and while a
    table is being tracked every paragraph that has a ``w:tbl`` ancestor is
    added to ``self.tables``.
    """
    pending = []
    self.page_map = OrderedDict()
    inside_table = False

    for node in descendants(doc, 'w:p', 'w:tbl'):
        if node.tag.endswith('}tbl'):
            inside_table = True
            self.tables.register(node)
            continue

        section = tuple(descendants(node, 'w:sectPr'))
        if not section:
            pending.append(node)
        else:
            props = PageProperties(section)
            pending.append(node)
            self.page_map.update((para, props) for para in pending)
            pending = []

        if inside_table:
            if ancestor(node, 'w:tbl') is None:
                # First paragraph outside any table: stop tracking.
                inside_table = False
            else:
                self.tables.add(node)

    if pending:
        trailing = XPath('./w:body/w:sectPr')(doc)
        props = PageProperties(trailing)
        self.page_map.update((para, props) for para in pending)
def read_page_properties(self, doc):
    """Build ``self.page_map``: paragraph element -> ``PageProperties``.

    A paragraph that contains ``w:sectPr`` closes a section; it and all
    paragraphs gathered since the previous section end share one
    ``PageProperties`` object.  Whatever remains after the last such
    paragraph is assigned the properties read from ``./w:body/w:sectPr``.
    ``w:tbl`` elements are registered with ``self.tables`` and paragraphs
    nested in a table are added to it while a table is being tracked.
    """
    self.page_map = OrderedDict()
    unassigned = []
    table_open = False

    def assign(elements, source):
        # Share a single PageProperties instance between all elements.
        props = PageProperties(source)
        for element in elements:
            self.page_map[element] = props

    for elem in descendants(doc, "w:p", "w:tbl"):
        if elem.tag.endswith("}tbl"):
            table_open = True
            self.tables.register(elem)
            continue

        section = tuple(descendants(elem, "w:sectPr"))
        unassigned.append(elem)
        if section:
            assign(unassigned, section)
            unassigned = []

        if not table_open:
            continue
        if ancestor(elem, "w:tbl") is not None:
            self.tables.add(elem)
        else:
            table_open = False

    if unassigned:
        assign(unassigned, XPath("./w:body/w:sectPr")(doc))
def read_page_properties(self, doc):
    """Populate ``self.page_map`` and ``self.section_starts`` from *doc*.

    ``w:p`` and ``w:tbl`` descendants are gathered into a group until a
    paragraph containing ``w:sectPr`` is met.  Every member of the group
    (tables included) is mapped to one ``PageProperties`` built from those
    ``w:sectPr`` elements, and the first member of the group is appended to
    ``self.section_starts``.  A trailing group uses the ``w:sectPr`` found
    directly under ``w:body``.  Each table is also registered with
    ``self.tables`` together with ``self.styles``.
    """
    group = []
    self.page_map = OrderedDict()
    self.section_starts = []

    for node in descendants(doc, 'w:p', 'w:tbl'):
        if node.tag.endswith('}tbl'):
            self.tables.register(node, self.styles)
            group.append(node)
            continue

        section = tuple(descendants(node, 'w:sectPr'))
        if not section:
            group.append(node)
            continue

        props = PageProperties(section)
        members = group + [node]
        self.page_map.update((member, props) for member in members)
        self.section_starts.append(members[0])
        group = []

    if group:
        self.section_starts.append(group[0])
        body_section = XPath('./w:body/w:sectPr')(doc)
        props = PageProperties(body_section)
        self.page_map.update((member, props) for member in group)
def read_block_anchors(self, doc):
    """Attach body-level bookmarks to the block that follows them.

    For every named ``w:bookmarkStart`` that is a direct child of
    ``w:body``, the next ``w:p`` that has a converted counterpart in
    ``self.object_map`` receives an ``id`` (generated only if it has none),
    and ``self.anchor_map`` maps the bookmark name to that id.
    """
    body_bookmarks = frozenset(
        XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not body_bookmarks:
        return

    # object_map goes converted -> source; we need the reverse lookup.
    converted_for = {
        source: converted
        for converted, source in self.object_map.iteritems()
    }
    pending_name = None

    for node in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if not node.tag.endswith('}p'):
            if node in body_bookmarks:
                pending_name = get(node, 'w:name')
            continue

        if not pending_name or node not in converted_for:
            continue

        block = converted_for[node]
        if 'id' not in block.attrib:
            block.set('id', generate_anchor(
                pending_name, frozenset(self.anchor_map.itervalues())))
        self.anchor_map[pending_name] = block.get('id')
        pending_name = None
def from_headings(body, log):
    """Build a table of contents from the h1/h2/h3 elements under *body*.

    Each heading becomes an entry pointing at ``index.html`` plus the
    heading's ``id`` (one of the form ``toc_id_N`` is assigned when it is
    missing).  An entry is nested under the closest preceding entry of a
    shallower level, so a heading whose natural parent level is absent is
    promoted.

    Returns the TOC root when it flattens to more than one item (after
    calling *log* with a message); otherwise returns ``None``.
    """
    heading_tags = ('h1', 'h2', 'h3')
    depth = len(heading_tags)
    root = TOC()
    finders = [XPath('//%s' % tag) for tag in heading_tags]

    # latest[n] is the most recent entry created at level n; 0 is the root.
    latest = dict.fromkeys(xrange(1, depth + 1))
    latest[0] = root

    level_of = {}
    for level, finder in enumerate(finders, 1):
        for elem in frozenset(finder(body)):
            level_of[elem] = level

    counter = Count()

    def id_for(elem):
        existing = elem.get('id', None)
        if existing:
            return existing
        counter.val += 1
        fresh = 'toc_id_%d' % counter.val
        elem.set('id', fresh)
        return fresh

    for heading in descendants(body, *heading_tags):
        level = level_of.get(heading, None)
        if level is None:
            continue

        # Climb until a level that currently has an entry (the root always
        # does), then place this heading directly beneath it.
        parent_level = level - 1
        while latest[parent_level] is None:
            parent_level -= 1
        parent = latest[parent_level]
        level = parent_level + 1

        anchor = id_for(heading)
        label = elem_to_toc_text(heading)
        latest[level] = parent.add_item('index.html', anchor, label)

        # Deeper levels no longer have a valid "most recent" entry.
        for deeper in xrange(level + 1, depth + 1):
            latest[deeper] = None

    if len(tuple(root.flat())) <= 1:
        return None
    log('Generating Table of Contents from headings')
    return root
def create_toc(self):
    """Build a table of contents from the h1/h2/h3 elements of ``self.body``.

    Every heading is added as an entry targeting ``index.html`` and the
    heading's ``id``; headings without one get ``toc_id_N`` where N comes
    from ``self.idcount`` (reset to 0 by this call).  Entries nest under the
    closest preceding entry of a shallower level.

    Returns the TOC root when it flattens to more than one item, otherwise
    ``None``.
    """
    root = self.body
    heading_tags = ('h1', 'h2', 'h3')
    depth = len(heading_tags)
    toc_root = TOC()
    finders = [XPath('//%s' % tag) for tag in heading_tags]

    # newest[n] is the last entry created at level n; level 0 is the root.
    newest = {0: toc_root}
    for level in xrange(1, depth + 1):
        newest[level] = None

    level_of = {}
    for level, finder in enumerate(finders, 1):
        for elem in frozenset(finder(root)):
            level_of[elem] = level

    self.idcount = 0

    def id_for(elem):
        current = elem.get('id', None)
        if current:
            return current
        self.idcount += 1
        current = 'toc_id_%d' % self.idcount
        elem.set('id', current)
        return current

    for heading in descendants(root, *heading_tags):
        level = level_of.get(heading, None)
        if level is None:
            continue

        # Find the nearest shallower level that has an entry; the root at
        # level 0 guarantees the search stops.
        parent_level = level - 1
        while newest[parent_level] is None:
            parent_level -= 1
        parent = newest[parent_level]
        level = parent_level + 1

        anchor = id_for(heading)
        label = elem_to_toc_text(heading)
        newest[level] = parent.add_item('index.html', anchor, label)

        for deeper in xrange(level + 1, depth + 1):
            newest[deeper] = None

    if len(tuple(toc_root.flat())) > 1:
        return toc_root
    return None
def convert_p(self, p):
    """Convert the ``w:p`` element *p* into a block element and return it.

    Each ``w:r`` descendant is converted with ``convert_run`` and appended
    to the block.  A new bookmark name is given a generated anchor, which
    becomes the ``id`` of the block (if still empty) or of the next run.
    Runs that have a ``w:hyperlink`` ancestor are recorded in
    ``self.link_map``.  A style named "heading N" turns the block into
    ``hN`` (N clamped to 1..6), and right-to-left direction sets
    ``dir="rtl"``.  Adjacent runs sharing a border are wrapped in one span.
    """
    dest = P()
    self.object_map[dest] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, para_style.frame)

    pending_anchor = None
    open_link = None

    for node in descendants(p, "w:r", "w:bookmarkStart", "w:hyperlink"):
        tag = node.tag
        if tag.endswith("}r"):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself while it has no children yet.
                target = dest if len(dest) == 0 else span
                target.set("id", pending_anchor)
                pending_anchor = None
            if open_link is not None:
                link = ancestor(node, "w:hyperlink")
                if link is None:
                    # This run is outside any hyperlink: stop tracking.
                    open_link = None
                else:
                    self.link_map[link].append(span)
            dest.append(span)
            self.layers[p].append(node)
        elif tag.endswith("}bookmarkStart"):
            name = get(node, "w:name")
            if name and name not in self.anchor_map:
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues())
                )
                self.anchor_map[name] = pending_anchor
        elif tag.endswith("}hyperlink"):
            open_link = node

    heading = re.match(
        r"heading\s+(\d+)$", para_style.style_name or "", re.IGNORECASE
    )
    if heading is not None:
        level = int(heading.group(1))
        dest.tag = "h%d" % min(6, max(1, level))

    if para_style.direction == "rtl":
        dest.set("dir", "rtl")

    # Group consecutive spans whose run styles report the same border.
    # NOTE(review): a group still open when the loop ends is never stored,
    # and the span that ends a group is not carried into the next one --
    # confirm this is intended.
    group = []
    shared_border_groups = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not group or group[-1][1].same_border(run_style):
            group.append((span, run_style))
        else:
            if len(group) > 1:
                shared_border_groups.append(group)
            group = []

    for members in shared_border_groups:
        spans = []
        border_css = {}
        for span, run_style in members:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            spans.append(span)
        if not border_css:
            continue
        cls = self.styles.register(border_css, "text_border")
        wrapper = self.wrap_elems(spans, SPAN())
        wrapper.set("class", cls)

    return dest
def convert_p(self, p):
    """Convert the ``w:p`` element *p* into a block element and return it.

    Only ``w:r``, ``w:bookmarkStart`` and ``w:hyperlink`` descendants whose
    nearest enclosing ``w:p`` is *p* itself are processed.  Runs are
    converted with ``convert_run``; runs inside a hyperlink are recorded in
    ``self.link_map`` / ``self.link_source_map`` and flagged with
    ``is-link="1"``.  New bookmark names get a generated anchor that becomes
    the ``id`` of the block (if still empty) or of the next run.  A style
    named "heading N" turns the block into ``hN`` (N clamped to 1..6),
    right-to-left direction sets ``dir="rtl"``, adjacent runs sharing a
    border are wrapped in one span, and NBSP padding is added for empty
    paragraphs and trailing ``<br>`` elements.
    """
    dest = P()
    self.object_map[dest] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, para_style.frame)

    pending_anchor = None
    open_link = None
    nearest_link = XPath('ancestor::w:hyperlink[1]')

    def owning_paragraph(node):
        # Nested <w:p> tags can occur when a textbox sits inside a
        # paragraph; find the closest enclosing one.  Falls off the end
        # (returning None) once an ancestor has no usable ``tag``.
        node = node.getparent()
        while True:
            try:
                if node.tag.endswith('}p'):
                    return node
            except AttributeError:
                return None
            node = node.getparent()

    for node in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if owning_paragraph(node) is not p:
            continue

        tag = node.tag
        if tag.endswith('}r'):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself while it has no children yet.
                target = dest if len(dest) == 0 else span
                target.set('id', pending_anchor)
                pending_anchor = None
            if open_link is not None:
                try:
                    link = nearest_link(node)[0]
                    self.link_map[link].append(span)
                    self.link_source_map[link] = self.current_rels
                    node.set('is-link', '1')
                except IndexError:
                    # No hyperlink ancestor: the link has ended.
                    open_link = None
            dest.append(span)
            self.layers[p].append(node)
        elif tag.endswith('}bookmarkStart'):
            name = get(node, 'w:name')
            if name and name not in self.anchor_map:
                superseded = pending_anchor
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues()))
                self.anchor_map[name] = pending_anchor
                if superseded is not None:
                    # The previous anchor was not applied to any element,
                    # so redirect every name that used it to the new one.
                    stale = [
                        key for key, value in self.anchor_map.iteritems()
                        if value == superseded
                    ]
                    for key in stale:
                        self.anchor_map[key] = pending_anchor
        elif tag.endswith('}hyperlink'):
            open_link = node

    heading = re.match(
        r'heading\s+(\d+)$', para_style.style_name or '', re.IGNORECASE)
    if heading is not None:
        level = int(heading.group(1))
        dest.tag = 'h%d' % min(6, max(1, level))

    if para_style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Group consecutive spans whose run styles report the same border.
    # NOTE(review): a group still open when the loop ends is never stored,
    # and the span that ends a group is not carried into the next one --
    # confirm this is intended.
    group = []
    shared_border_groups = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not group or group[-1][1].same_border(run_style):
            group.append((span, run_style))
        else:
            if len(group) > 1:
                shared_border_groups.append(group)
            group = []

    for members in shared_border_groups:
        spans = []
        border_css = {}
        for span, run_style in members:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            spans.append(span)
        if not border_css:
            continue
        cls = self.styles.register(border_css, 'text_border')
        wrapper = self.wrap_elems(spans, SPAN())
        wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0:
        last = dest[-1]
        if not last.tail:
            if last.tag == 'br':
                last.tail = NBSP
            elif len(last) > 0:
                inner = last[-1]
                if inner.tag == 'br' and not inner.tail:
                    inner.tail = NBSP

    return dest
def convert_p(self, p):
    """Turn the ``w:p`` element *p* into a block element and return it.

    Descendant runs, bookmark starts and hyperlinks are handled only when
    *p* is their closest enclosing ``w:p``.  Converted runs are appended to
    the block; hyperlinked runs are tracked in ``self.link_map`` and
    ``self.link_source_map`` and marked ``is-link="1"``.  Bookmarks produce
    anchors stored in ``self.anchor_map`` and applied as an ``id``.
    Heading styles change the tag to ``h1``..``h6``, rtl paragraphs get
    ``dir="rtl"``, runs with a common border are wrapped together, and
    NBSP is inserted for empty paragraphs and trailing ``<br>`` elements.
    """
    dest = P()
    self.object_map[dest] = p
    block_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, block_style.frame)

    anchor_to_apply = None
    active_link = None
    find_link = XPath('ancestor::w:hyperlink[1]')

    def closest_p(elem):
        # Ensure that nested <w:p> tags are handled. These can occur if a
        # textbox is present inside a paragraph.
        parent = elem.getparent()
        while True:
            try:
                is_paragraph = parent.tag.endswith('}p')
            except AttributeError:
                # Ran out of ancestors with a string tag.
                return None
            if is_paragraph:
                return parent
            parent = parent.getparent()

    def handle_run(run):
        span = self.convert_run(run)
        anchor, link = anchor_to_apply, active_link
        if anchor is not None:
            # An empty block takes the id itself, otherwise the run does.
            (span if len(dest) else dest).set('id', anchor)
            anchor = None
        if link is not None:
            try:
                hyperlink = find_link(run)[0]
                self.link_map[hyperlink].append(span)
                self.link_source_map[hyperlink] = self.current_rels
                run.set('is-link', '1')
            except IndexError:
                link = None
        dest.append(span)
        self.layers[p].append(run)
        return anchor, link

    def handle_bookmark(bookmark):
        name = get(bookmark, 'w:name')
        if not name or name in self.anchor_map:
            return anchor_to_apply
        unused = anchor_to_apply
        fresh = generate_anchor(
            name, frozenset(self.anchor_map.itervalues()))
        self.anchor_map[name] = fresh
        if unused is not None:
            # The previous anchor was not applied to any element
            for key, value in tuple(self.anchor_map.iteritems()):
                if value == unused:
                    self.anchor_map[key] = fresh
        return fresh

    for child in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if closest_p(child) is not p:
            continue
        if child.tag.endswith('}r'):
            anchor_to_apply, active_link = handle_run(child)
        elif child.tag.endswith('}bookmarkStart'):
            anchor_to_apply = handle_bookmark(child)
        elif child.tag.endswith('}hyperlink'):
            active_link = child

    match = re.match(
        r'heading\s+(\d+)$', block_style.style_name or '', re.IGNORECASE)
    if match is not None:
        number = min(6, max(1, int(match.group(1))))
        dest.tag = 'h%d' % number

    if block_style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Collect stretches of adjacent spans that report the same border.
    # NOTE(review): the stretch in progress when the loop finishes is never
    # recorded, and the span that breaks a stretch does not start the next
    # one -- confirm this is intended.
    stretch = []
    stretches = []
    for span in dest:
        source_run = self.object_map[span]
        span_style = self.styles.resolve_run(source_run)
        if not stretch or stretch[-1][1].same_border(span_style):
            stretch.append((span, span_style))
            continue
        if len(stretch) > 1:
            stretches.append(stretch)
        stretch = []

    for stretch in stretches:
        wrapped = []
        css = {}
        for span, span_style in stretch:
            span_style.get_border_css(css)
            span_style.clear_border_css()
            wrapped.append(span)
        if css:
            cls = self.styles.register(css, 'text_border')
            self.wrap_elems(wrapped, SPAN()).set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        tail_elem = dest[-1]
        if tail_elem.tag == 'br':
            tail_elem.tail = NBSP
        elif (len(tail_elem) > 0 and tail_elem[-1].tag == 'br'
                and not tail_elem[-1].tail):
            tail_elem[-1].tail = NBSP

    return dest
def __iter__(self):
    """Yield every ``w:p`` and ``w:tbl`` descendant of ``self.parent``."""
    wanted = ('w:p', 'w:tbl')
    for block in descendants(self.parent, *wanted):
        yield block
def convert_p(self, p):
    """Convert the ``w:p`` element *p* into a block element and return it.

    Each ``w:r`` descendant is converted with ``convert_run`` and appended
    to the block.  A new bookmark name is given a generated anchor, which
    becomes the ``id`` of the block (if still empty) or of the next run.
    Runs that have a ``w:hyperlink`` ancestor are recorded in
    ``self.link_map``.  A style named "heading N" turns the block into
    ``hN`` (N clamped to 1..6), and right-to-left direction sets
    ``dir="rtl"``.  Adjacent runs sharing a border are wrapped in one span.
    """
    dest = P()
    self.object_map[dest] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, para_style.frame)

    pending_anchor = None
    open_link = None

    for node in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        tag = node.tag
        if tag.endswith('}r'):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself while it has no children yet.
                target = dest if len(dest) == 0 else span
                target.set('id', pending_anchor)
                pending_anchor = None
            if open_link is not None:
                link = ancestor(node, 'w:hyperlink')
                if link is None:
                    # This run is outside any hyperlink: stop tracking.
                    open_link = None
                else:
                    self.link_map[link].append(span)
            dest.append(span)
            self.layers[p].append(node)
        elif tag.endswith('}bookmarkStart'):
            name = get(node, 'w:name')
            if name and name not in self.anchor_map:
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues()))
                self.anchor_map[name] = pending_anchor
        elif tag.endswith('}hyperlink'):
            open_link = node

    heading = re.match(
        r'heading\s+(\d+)$', para_style.style_name or '', re.IGNORECASE)
    if heading is not None:
        level = int(heading.group(1))
        dest.tag = 'h%d' % min(6, max(1, level))

    if para_style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Group consecutive spans whose run styles report the same border.
    # NOTE(review): a group still open when the loop ends is never stored,
    # and the span that ends a group is not carried into the next one --
    # confirm this is intended.
    group = []
    shared_border_groups = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not group or group[-1][1].same_border(run_style):
            group.append((span, run_style))
        else:
            if len(group) > 1:
                shared_border_groups.append(group)
            group = []

    for members in shared_border_groups:
        spans = []
        border_css = {}
        for span, run_style in members:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            spans.append(span)
        if not border_css:
            continue
        cls = self.styles.register(border_css, 'text_border')
        wrapper = self.wrap_elems(spans, SPAN())
        wrapper.set('class', cls)

    return dest
def convert_p(self, p):
    """Convert the ``w:p`` element *p* into a block element and return it.

    Each ``w:r`` descendant is converted with ``convert_run`` and appended
    to the block.  A new bookmark name is given a generated anchor, which
    becomes the ``id`` of the block (if still empty) or of the next run.
    Runs whose nearest ``w:hyperlink`` ancestor exists are recorded in
    ``self.link_map``.  A style named "heading N" turns the block into
    ``hN`` (N clamped to 1..6), right-to-left direction sets ``dir="rtl"``,
    adjacent runs sharing a border are wrapped in one span, and a block
    left with no text and no children gets a non-breaking space.
    """
    dest = P()
    self.object_map[dest] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, para_style.frame)

    pending_anchor = None
    open_link = None
    nearest_link = XPath('ancestor::w:hyperlink[1]')

    for node in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        tag = node.tag
        if tag.endswith('}r'):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself while it has no children yet.
                target = dest if len(dest) == 0 else span
                target.set('id', pending_anchor)
                pending_anchor = None
            if open_link is not None:
                try:
                    link = nearest_link(node)[0]
                    self.link_map[link].append(span)
                except IndexError:
                    # No hyperlink ancestor: the link has ended.
                    open_link = None
            dest.append(span)
            self.layers[p].append(node)
        elif tag.endswith('}bookmarkStart'):
            name = get(node, 'w:name')
            if name and name not in self.anchor_map:
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues()))
                self.anchor_map[name] = pending_anchor
        elif tag.endswith('}hyperlink'):
            open_link = node

    heading = re.match(
        r'heading\s+(\d+)$', para_style.style_name or '', re.IGNORECASE)
    if heading is not None:
        level = int(heading.group(1))
        dest.tag = 'h%d' % min(6, max(1, level))

    if para_style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Group consecutive spans whose run styles report the same border.
    # NOTE(review): a group still open when the loop ends is never stored,
    # and the span that ends a group is not carried into the next one --
    # confirm this is intended.
    group = []
    shared_border_groups = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not group or group[-1][1].same_border(run_style):
            group.append((span, run_style))
        else:
            if len(group) > 1:
                shared_border_groups.append(group)
            group = []

    for members in shared_border_groups:
        spans = []
        border_css = {}
        for span, run_style in members:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            spans.append(span)
        if not border_css:
            continue
        cls = self.styles.register(border_css, 'text_border')
        wrapper = self.wrap_elems(spans, SPAN())
        wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = '\xa0'

    return dest
def convert_p(self, p):
    """Convert the ``w:p`` element *p* into a block element and return it.

    Each ``w:r`` descendant is converted with ``convert_run`` and appended
    to the block.  Runs whose nearest ``w:hyperlink`` ancestor exists are
    recorded in ``self.link_map`` and flagged with ``is-link="1"``.  A new
    bookmark name is given a generated anchor, which becomes the ``id`` of
    the block (if still empty) or of the next run; if an earlier anchor was
    never applied, the names pointing at it are redirected to the new one.
    A style named "heading N" turns the block into ``hN`` (N clamped to
    1..6), right-to-left direction sets ``dir="rtl"``, adjacent runs
    sharing a border are wrapped in one span, and a block left with no text
    and no children gets a non-breaking space.
    """
    dest = P()
    self.object_map[dest] = p
    para_style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, para_style.frame)

    pending_anchor = None
    open_link = None
    nearest_link = XPath('ancestor::w:hyperlink[1]')

    for node in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        tag = node.tag
        if tag.endswith('}r'):
            span = self.convert_run(node)
            if pending_anchor is not None:
                # Anchor the block itself while it has no children yet.
                target = dest if len(dest) == 0 else span
                target.set('id', pending_anchor)
                pending_anchor = None
            if open_link is not None:
                try:
                    link = nearest_link(node)[0]
                    self.link_map[link].append(span)
                    node.set('is-link', '1')
                except IndexError:
                    # No hyperlink ancestor: the link has ended.
                    open_link = None
            dest.append(span)
            self.layers[p].append(node)
        elif tag.endswith('}bookmarkStart'):
            name = get(node, 'w:name')
            if name and name not in self.anchor_map:
                superseded = pending_anchor
                pending_anchor = generate_anchor(
                    name, frozenset(self.anchor_map.itervalues()))
                self.anchor_map[name] = pending_anchor
                if superseded is not None:
                    # The previous anchor was not applied to any element,
                    # so redirect every name that used it to the new one.
                    stale = [
                        key for key, value in self.anchor_map.iteritems()
                        if value == superseded
                    ]
                    for key in stale:
                        self.anchor_map[key] = pending_anchor
        elif tag.endswith('}hyperlink'):
            open_link = node

    heading = re.match(
        r'heading\s+(\d+)$', para_style.style_name or '', re.IGNORECASE)
    if heading is not None:
        level = int(heading.group(1))
        dest.tag = 'h%d' % min(6, max(1, level))

    if para_style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Group consecutive spans whose run styles report the same border.
    # NOTE(review): a group still open when the loop ends is never stored,
    # and the span that ends a group is not carried into the next one --
    # confirm this is intended.
    group = []
    shared_border_groups = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not group or group[-1][1].same_border(run_style):
            group.append((span, run_style))
        else:
            if len(group) > 1:
                shared_border_groups.append(group)
            group = []

    for members in shared_border_groups:
        spans = []
        border_css = {}
        for span, run_style in members:
            run_style.get_border_css(border_css)
            run_style.clear_border_css()
            spans.append(span)
        if not border_css:
            continue
        cls = self.styles.register(border_css, 'text_border')
        wrapper = self.wrap_elems(spans, SPAN())
        wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = '\xa0'

    return dest