def read_block_anchors(self, doc):
    """Map body-level bookmarks to the id of the next converted paragraph.

    Bookmarks that are direct children of ``w:body`` are collected as they
    are yielded by ``descendants``. When the next ``w:p`` that has a
    converted counterpart in ``self.object_map`` is reached, every collected
    bookmark name is recorded in ``self.anchor_map`` against that
    counterpart's ``id`` (an id is generated if it has none).

    :param doc: root element of the document being converted.
    """
    doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not doc_anchors:
        return

    # Bookmark names seen since the last paragraph that received an anchor.
    # All of them are kept: several bookmarks may precede one paragraph and
    # every one of them must resolve to it.
    pending = []
    # object_map goes converted element -> source element; invert it.
    rmap = {v: k for k, v in self.object_map.items()}
    for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if p.tag.endswith('}p'):
            if pending and p in rmap:
                para = rmap[p]
                if 'id' not in para.attrib:
                    # Seed the id from the most recent bookmark, which is
                    # what the single-bookmark implementation used.
                    para.set('id', generate_anchor(pending[-1], frozenset(self.anchor_map.values())))
                target = para.get('id')
                for name in pending:
                    self.anchor_map[name] = target
                pending = []
        elif p in doc_anchors:
            name = get(p, 'w:name')
            if name:
                pending.append(name)
def read_block_anchors(self, doc):
    """Map body-level bookmarks to the id of the next converted paragraph.

    Bookmarks that are direct children of ``w:body`` are collected as they
    are yielded by ``self.namespace.descendants``. When the next ``w:p``
    that has a converted counterpart in ``self.object_map`` is reached,
    every collected bookmark name is recorded in ``self.anchor_map`` against
    that counterpart's ``id`` (an id is generated if it has none).

    :param doc: root element of the document being converted.
    """
    doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not doc_anchors:
        return

    # An ordered list rather than a set: the generated id is seeded from one
    # of these names, and picking "whatever a set yields first" depends on
    # string hash randomization, so the id could differ between runs.
    pending = []
    # object_map goes converted element -> source element; invert it.
    rmap = {v: k for k, v in iteritems(self.object_map)}
    for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if p.tag.endswith('}p'):
            if pending and p in rmap:
                para = rmap[p]
                if 'id' not in para.attrib:
                    para.set('id', generate_anchor(pending[0], frozenset(itervalues(self.anchor_map))))
                target = para.get('id')
                for name in pending:
                    self.anchor_map[name] = target
                pending = []
        elif p in doc_anchors:
            anchor = self.namespace.get(p, 'w:name')
            if anchor:
                pending.append(anchor)
def read_block_anchors(self, doc):
    """Map body-level bookmarks to the id of the next converted paragraph.

    Bookmarks that are direct children of ``w:body`` are collected as they
    are yielded by ``self.namespace.descendants``. When the next ``w:p``
    that has a converted counterpart in ``self.object_map`` is reached,
    every collected bookmark name is recorded in ``self.anchor_map`` against
    that counterpart's ``id`` (an id is generated if it has none).

    :param doc: root element of the document being converted.
    """
    doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not doc_anchors:
        return

    # An ordered list rather than a set: the generated id is seeded from one
    # of these names, and picking "whatever a set yields first" depends on
    # string hash randomization, so the id could differ between runs.
    pending = []
    # object_map goes converted element -> source element; invert it.
    # items()/values() are used because iteritems()/itervalues() do not
    # exist on Python 3 dicts (assumes both maps are plain dicts).
    rmap = {v: k for k, v in self.object_map.items()}
    for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if p.tag.endswith('}p'):
            if pending and p in rmap:
                para = rmap[p]
                if 'id' not in para.attrib:
                    para.set('id', generate_anchor(pending[0], frozenset(self.anchor_map.values())))
                target = para.get('id')
                for name in pending:
                    self.anchor_map[name] = target
                pending = []
        elif p in doc_anchors:
            anchor = self.namespace.get(p, 'w:name')
            if anchor:
                pending.append(anchor)
def convert_p(self, p):
    """Convert a ``w:p`` element into an HTML block element and return it.

    Runs are converted with ``self.convert_run`` and appended as children.
    Bookmarks and ``TOC`` field instructions become ``id`` attributes on the
    paragraph or on the following run, hyperlink membership is recorded in
    ``self.link_map``/``self.link_source_map``, ``heading N`` styles become
    ``h1``..``h6`` and runs with a common border are wrapped together.

    :param p: the source ``w:p`` element.
    :return: the element created by ``P()``, also registered in
        ``self.object_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.frame_map[p] = style.frame
    self.add_frame(dest, style.frame)

    current_anchor = None
    current_hyperlink = None
    hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

    def p_parent(x):
        # Ensure that nested <w:p> tags are handled. These can occur if a
        # textbox is present inside a paragraph.
        while True:
            x = x.getparent()
            try:
                if x.tag.endswith('}p'):
                    return x
            except AttributeError:
                break

    def retarget_anchor(old_id, new_id):
        # Point everything that resolved to old_id (which will never be put
        # on an element) at new_id instead, including the TOC marker.
        for name, target in tuple(iteritems(self.anchor_map)):
            if target == old_id:
                self.anchor_map[name] = new_id
        if getattr(self, 'toc_anchor', None) == old_id:
            self.toc_anchor = new_id

    for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
        if p_parent(x) is not p:
            continue
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # An anchor before the first run goes on the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = self.namespace.get(x, 'w:name')
            if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                # _GoBack is a special bookmark inserted by Word 2010 for
                # the return to previous edit feature, we ignore it
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    retarget_anchor(old_anchor, current_anchor)
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x
        elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
            old_anchor = current_anchor
            anchor = unicode_type(uuid.uuid4())
            self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
            self.toc_anchor = current_anchor
            if old_anchor is not None:
                # The previous anchor was not applied to any element
                retarget_anchor(old_anchor, current_anchor)

    if current_anchor is not None:
        # No run followed the last anchor. Put it on the paragraph, unless
        # the paragraph already carries an id: overwriting that id would
        # orphan whatever anchor was applied earlier, so reuse it instead.
        existing_id = dest.get('id')
        if existing_id is None:
            dest.set('id', current_anchor)
        else:
            retarget_anchor(current_anchor, existing_id)
        current_anchor = None

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.bidi is True:
        dest.set('dir', 'rtl')

    # Collect consecutive runs whose styles report the same border.
    # NOTE(review): the run that ends a group is not used to start the next
    # one, and a group still open when the loop finishes is never added to
    # common_borders -- confirm this is intended before changing it.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        else:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    # Move the border CSS of each group onto one wrapping SPAN.
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    if not dest.text and len(dest) == 0 and not style.has_visible_border():
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        if dest[-1].tag == 'br':
            dest[-1].tail = NBSP
        elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
            dest[-1][-1].tail = NBSP

    return dest
def convert_p(self, p):
    """Convert a ``w:p`` element into an HTML block element and return it.

    Runs are converted with ``self.convert_run`` and appended as children.
    Bookmarks become ``id`` attributes on the paragraph or on the following
    run, hyperlink membership is recorded in ``self.link_map``,
    ``heading N`` styles become ``h1``..``h6`` and runs with a common border
    are wrapped together.

    :param p: the source ``w:p`` element.
    :return: the element created by ``P()``, also registered in
        ``self.object_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    def retarget_anchor(old_id, new_id):
        # Point every bookmark that resolved to old_id (which will never be
        # put on an element) at new_id instead.
        for name, target in tuple(self.anchor_map.items()):
            if target == old_id:
                self.anchor_map[name] = new_id

    current_anchor = None
    current_hyperlink = None
    for x in descendants(p, "w:r", "w:bookmarkStart", "w:hyperlink"):
        if x.tag.endswith("}r"):
            span = self.convert_run(x)
            if current_anchor is not None:
                # An anchor before the first run goes on the paragraph itself
                (dest if len(dest) == 0 else span).set("id", current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                hl = ancestor(x, "w:hyperlink")
                if hl is not None:
                    self.link_map[hl].append(span)
                else:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith("}bookmarkStart"):
            anchor = get(x, "w:name")
            if anchor and anchor not in self.anchor_map:
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(
                    anchor, frozenset(self.anchor_map.values())
                )
                if old_anchor is not None:
                    # The previous anchor was not applied to any element,
                    # without this its bookmark would point at nothing.
                    retarget_anchor(old_anchor, current_anchor)
        elif x.tag.endswith("}hyperlink"):
            current_hyperlink = x

    if current_anchor is not None:
        # No run followed the last bookmark. Put it on the paragraph, unless
        # the paragraph already carries an id: overwriting that id would
        # orphan whatever anchor was applied earlier, so reuse it instead.
        existing_id = dest.get("id")
        if existing_id is None:
            dest.set("id", current_anchor)
        else:
            retarget_anchor(current_anchor, existing_id)
        current_anchor = None

    m = re.match(r"heading\s+(\d+)$", style.style_name or "", re.IGNORECASE)
    if m is not None:
        n = min(6, max(1, int(m.group(1))))
        dest.tag = "h%d" % n

    if style.direction == "rtl":
        dest.set("dir", "rtl")

    # Collect consecutive runs whose styles report the same border.
    # NOTE(review): the run that ends a group is not used to start the next
    # one, and a group still open when the loop finishes is never added to
    # common_borders -- confirm this is intended before changing it.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        else:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    # Move the border CSS of each group onto one wrapping SPAN.
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, "text_border")
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set("class", cls)

    return dest
def convert_p(self, p):
    """Convert a ``w:p`` element into an HTML block element and return it.

    Runs are converted with ``self.convert_run`` and appended as children.
    Bookmarks become ``id`` attributes on the paragraph or on the following
    run, hyperlink membership is recorded in
    ``self.link_map``/``self.link_source_map``, ``heading N`` styles become
    ``h1``..``h6`` and runs with a common border are wrapped together.

    :param p: the source ``w:p`` element.
    :return: the element created by ``P()``, also registered in
        ``self.object_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    current_anchor = None
    current_hyperlink = None
    hl_xpath = XPath('ancestor::w:hyperlink[1]')

    def p_parent(x):
        # Ensure that nested <w:p> tags are handled. These can occur if a
        # textbox is present inside a paragraph.
        while True:
            x = x.getparent()
            try:
                if x.tag.endswith('}p'):
                    return x
            except AttributeError:
                break

    def retarget_anchor(old_id, new_id):
        # Point every bookmark that resolved to old_id (which will never be
        # put on an element) at new_id instead.
        for name, target in tuple(self.anchor_map.items()):
            if target == old_id:
                self.anchor_map[name] = new_id

    for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if p_parent(x) is not p:
            continue
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # An anchor before the first run goes on the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = get(x, 'w:name')
            if anchor and anchor not in self.anchor_map:
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.values()))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    retarget_anchor(old_anchor, current_anchor)
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x

    if current_anchor is not None:
        # No run followed the last bookmark, so nothing carries its id yet.
        # Put it on the paragraph, unless the paragraph already has an id:
        # overwriting that id would orphan whatever anchor was applied
        # earlier, so reuse it instead.
        existing_id = dest.get('id')
        if existing_id is None:
            dest.set('id', current_anchor)
        else:
            retarget_anchor(current_anchor, existing_id)
        current_anchor = None

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Collect consecutive runs whose styles report the same border.
    # NOTE(review): the run that ends a group is not used to start the next
    # one, and a group still open when the loop finishes is never added to
    # common_borders -- confirm this is intended before changing it.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        else:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    # Move the border CSS of each group onto one wrapping SPAN.
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        if dest[-1].tag == 'br':
            dest[-1].tail = NBSP
        elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
            dest[-1][-1].tail = NBSP

    return dest
def convert_p(self, p):
    """Convert a ``w:p`` element into an HTML block element and return it.

    Runs are converted with ``self.convert_run`` and appended as children.
    Bookmarks become ``id`` attributes on the paragraph or on the following
    run, hyperlink membership is recorded in ``self.link_map``,
    ``heading N`` styles become ``h1``..``h6`` and runs with a common border
    are wrapped together.

    :param p: the source ``w:p`` element.
    :return: the element created by ``P()``, also registered in
        ``self.object_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    def retarget_anchor(old_id, new_id):
        # Point every bookmark that resolved to old_id (which will never be
        # put on an element) at new_id instead.
        for name, target in tuple(self.anchor_map.items()):
            if target == old_id:
                self.anchor_map[name] = new_id

    current_anchor = None
    current_hyperlink = None
    for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # An anchor before the first run goes on the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                hl = ancestor(x, 'w:hyperlink')
                if hl is not None:
                    self.link_map[hl].append(span)
                else:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = get(x, 'w:name')
            if anchor and anchor not in self.anchor_map:
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(
                    anchor, frozenset(self.anchor_map.values()))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element,
                    # without this its bookmark would point at nothing.
                    retarget_anchor(old_anchor, current_anchor)
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x

    if current_anchor is not None:
        # No run followed the last bookmark. Put it on the paragraph, unless
        # the paragraph already carries an id: overwriting that id would
        # orphan whatever anchor was applied earlier, so reuse it instead.
        existing_id = dest.get('id')
        if existing_id is None:
            dest.set('id', current_anchor)
        else:
            retarget_anchor(current_anchor, existing_id)
        current_anchor = None

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Collect consecutive runs whose styles report the same border.
    # NOTE(review): the run that ends a group is not used to start the next
    # one, and a group still open when the loop finishes is never added to
    # common_borders -- confirm this is intended before changing it.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        else:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    # Move the border CSS of each group onto one wrapping SPAN.
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    return dest
def convert_p(self, p):
    """Convert a ``w:p`` element into an HTML block element and return it.

    Runs are converted with ``self.convert_run`` and appended as children.
    Bookmarks become ``id`` attributes on the paragraph or on the following
    run, hyperlink membership is recorded in ``self.link_map``,
    ``heading N`` styles become ``h1``..``h6`` and runs with a common border
    are wrapped together.

    :param p: the source ``w:p`` element.
    :return: the element created by ``P()``, also registered in
        ``self.object_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    def retarget_anchor(old_id, new_id):
        # Point every bookmark that resolved to old_id (which will never be
        # put on an element) at new_id instead.
        for name, target in tuple(self.anchor_map.items()):
            if target == old_id:
                self.anchor_map[name] = new_id

    current_anchor = None
    current_hyperlink = None
    hl_xpath = XPath('ancestor::w:hyperlink[1]')
    for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # An anchor before the first run goes on the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                except IndexError:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = get(x, 'w:name')
            if anchor and anchor not in self.anchor_map:
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.values()))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element,
                    # without this its bookmark would point at nothing.
                    retarget_anchor(old_anchor, current_anchor)
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x

    if current_anchor is not None:
        # No run followed the last bookmark. Put it on the paragraph, unless
        # the paragraph already carries an id: overwriting that id would
        # orphan whatever anchor was applied earlier, so reuse it instead.
        existing_id = dest.get('id')
        if existing_id is None:
            dest.set('id', current_anchor)
        else:
            retarget_anchor(current_anchor, existing_id)
        current_anchor = None

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Collect consecutive runs whose styles report the same border.
    # NOTE(review): the run that ends a group is not used to start the next
    # one, and a group still open when the loop finishes is never added to
    # common_borders -- confirm this is intended before changing it.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        else:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    # Move the border CSS of each group onto one wrapping SPAN.
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = '\xa0'

    return dest
def convert_p(self, p):
    """Convert a ``w:p`` element into an HTML block element and return it.

    Runs are converted with ``self.convert_run`` and appended as children.
    Bookmarks become ``id`` attributes on the paragraph or on the following
    run, hyperlink membership is recorded in
    ``self.link_map``/``self.link_source_map``, ``heading N`` styles become
    ``h1``..``h6`` and runs with a common border are wrapped together.

    :param p: the source ``w:p`` element.
    :return: the element created by ``P()``, also registered in
        ``self.object_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.add_frame(dest, style.frame)

    def retarget_anchor(old_id, new_id):
        # Point every bookmark that resolved to old_id (which will never be
        # put on an element) at new_id instead.
        for name, target in tuple(self.anchor_map.items()):
            if target == old_id:
                self.anchor_map[name] = new_id

    current_anchor = None
    current_hyperlink = None
    hl_xpath = XPath('ancestor::w:hyperlink[1]')
    for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # An anchor before the first run goes on the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run is no longer inside a hyperlink
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = get(x, 'w:name')
            if anchor and anchor not in self.anchor_map:
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(
                    anchor, frozenset(self.anchor_map.values()))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    retarget_anchor(old_anchor, current_anchor)
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x

    if current_anchor is not None:
        # No run followed the last bookmark, so nothing carries its id yet.
        # Put it on the paragraph, unless the paragraph already has an id:
        # overwriting that id would orphan whatever anchor was applied
        # earlier, so reuse it instead.
        existing_id = dest.get('id')
        if existing_id is None:
            dest.set('id', current_anchor)
        else:
            retarget_anchor(current_anchor, existing_id)
        current_anchor = None

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n

    if style.direction == 'rtl':
        dest.set('dir', 'rtl')

    # Collect consecutive runs whose styles report the same border.
    # NOTE(review): the run that ends a group is not used to start the next
    # one, and a group still open when the loop finishes is never added to
    # common_borders -- confirm this is intended before changing it.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        run_style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(run_style):
            border_runs.append((span, run_style))
        else:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    # Move the border CSS of each group onto one wrapping SPAN.
    for border_run in common_borders:
        spans = []
        bs = {}
        for span, run_style in border_run:
            run_style.get_border_css(bs)
            run_style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    if not dest.text and len(dest) == 0:
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        if dest[-1].tag == 'br':
            dest[-1].tail = NBSP
        elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
            dest[-1][-1].tail = NBSP

    return dest