def drawing_to_html(self, drawing, page):
    """Yield an image element for every picture inside a drawing.

    Pictures under ``wp:inline`` children are yielded first, followed by
    pictures under ``wp:anchor`` children (floats). The properties returned
    by ``get_image_properties`` are serialised into the ``style`` attribute
    of each image; for floats the same dict is first handed to
    ``self.get_float_properties`` together with ``page``.
    """
    find_pics = XPath('descendant::pic:pic')

    def as_css(props):
        return '; '.join('%s: %s' % (k, v) for k, v in props.iteritems())

    # First process the inline pictures
    for container in XPath('./wp:inline')(drawing):
        props, alt = get_image_properties(container)
        for pic in find_pics(container):
            img = self.pic_to_img(pic, alt, container)
            if img is None:
                continue
            if props:
                img.set('style', as_css(props))
            yield img

    # Now process the floats
    for container in XPath('./wp:anchor')(drawing):
        props, alt = get_image_properties(container)
        self.get_float_properties(container, props, page)
        for pic in find_pics(container):
            img = self.pic_to_img(pic, alt, container)
            if img is None:
                continue
            if props:
                img.set('style', as_css(props))
            yield img
def get_applicable_xe_fields(index, xe_fields):
    """Return the XE fields that an INDEX field should include.

    :param index: mapping describing the INDEX field; the keys read here are
        ``entry-type``, ``letter-range`` and ``bookmark``.
    :param xe_fields: list of mappings describing XE fields; the keys read
        here are ``entry-type``, ``text`` and ``start_elem``.
    :return: the sub-list of ``xe_fields`` that passes every filter the
        index specifies.
    """
    iet = index.get('entry-type', None)
    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]

    lr = index.get('letter-range', None)
    if lr is not None:
        # "a-m" -> ("a", "m"); a value without a dash leaves el empty and the
        # range filter is skipped.
        sl, _, el = lr.partition('-')
        sl, el = sl.strip(), el.strip()
        if sl and el:
            def inrange(text):
                # An entry with no text has no first letter to compare, so
                # it can never fall inside the range.
                return bool(text) and sl <= text[0] <= el
            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]

    bmark = index.get('bookmark', None)
    if bmark is None:
        return xe_fields
    if not xe_fields:
        # Nothing left to filter, and no element to run the document-wide
        # bookmark query from.
        return xe_fields

    attr = expand('w:name')
    # Any element works as the context node for an absolute ("//") query.
    bookmarks = {
        b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem'])
        if b.get(attr, None) == bmark
    }
    ancestors = XPath('ancestor::w:bookmarkStart')

    def contained(xe):
        # Check if the xe field is contained inside a bookmark with the
        # specified name
        return bool(set(ancestors(xe['start_elem'])) & bookmarks)

    return [xe for xe in xe_fields if contained(xe)]
def get_image_properties(parent):
    """Collect CSS properties and alternate text for an image container.

    Reads ``cx``/``cy`` from ``wp:extent`` children and ``descr``/``hidden``
    from ``wp:docPr`` children of ``parent``.

    :return: ``(props, alt)`` where ``props`` is a dict that may contain
        ``width``, ``height`` (formatted as points) and ``display``, and
        ``alt`` is the last non-empty ``descr`` value or None.
    """
    size = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for prop, attr in (('width', 'cx'), ('height', 'cy')):
            try:
                size[prop] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # Missing or non-numeric attribute: keep any earlier value
                pass

    ans = {}
    for prop in ('width', 'height'):
        if size[prop] is not None:
            ans[prop] = '%.3gpt' % size[prop]

    alt = None
    for doc_pr in XPath('./wp:docPr')(parent):
        descr = doc_pr.get('descr', None)
        if descr:
            alt = descr
        if doc_pr.get('hidden', None) in {'true', 'on', '1'}:
            ans['display'] = 'none'

    return ans, alt
def pic_to_img(self, pic, alt, parent):
    """Build an image element for a ``pic:pic`` element.

    :param pic: the picture element to convert.
    :param alt: alternate text supplied by the caller; used unless the
        picture's own ``pic:cNvPr`` carries a non-empty ``descr``.
    :param parent: container searched for ``a:hlinkClick`` elements; when
        one is found the image is recorded in ``self.links`` together with
        the link's id, target frame and tooltip.
    :return: the image element for the first blip whose relationship id is
        in ``self.rid_map`` and whose file can be generated, otherwise None.
    """
    name = None
    link = None
    for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
        link = {'id': get(hl, 'r:id')}
        tgt = hl.get('tgtFrame', None)
        if tgt:
            link['target'] = tgt
        title = hl.get('tooltip', None)
        if title:
            link['title'] = title

    for pr in XPath('descendant::pic:cNvPr')(pic):
        name = pr.get('name', None)
        if name:
            name = ascii_filename(name).replace(' ', '_')
        # Prefer the picture's own description but keep the caller-supplied
        # text when there is none, instead of discarding it.
        alt = pr.get('descr', None) or alt
        for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
            # Embedded images take precedence over linked ones
            rid = get(a, 'r:embed')
            if not rid:
                rid = get(a, 'r:link')
            if rid and rid in self.rid_map:
                try:
                    src = self.generate_filename(rid, name)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if link is not None:
                    self.links.append((img, link))
                return img
def create_instance(n, definition):
    """Create a numbering instance from a definition plus its overrides.

    :param n: the ``w:num`` element whose ``w:lvlOverride`` children are read.
    :param definition: the numbering definition to copy; it is not modified
        directly, all changes go to the copy.
    :return: the copied definition with level and start overrides applied.
    """
    nd = definition.copy()
    start_overrides = {}
    for lo in XPath('./w:lvlOverride')(n):
        try:
            ilvl = int(get(lo, 'w:ilvl'))
        except (ValueError, TypeError):
            ilvl = None
        for so in XPath('./w:startOverride[@w:val]')(lo):
            try:
                start_override = int(get(so, 'w:val'))
            except (TypeError, ValueError):
                pass
            else:
                start_overrides[ilvl] = start_override
        for lvl in XPath('./w:lvl')(lo)[:1]:
            # NOTE(review): nilvl is the raw attribute string while ilvl is
            # an int when it parsed; confirm which key type nd.levels uses.
            nilvl = get(lvl, 'w:ilvl')
            ilvl = nilvl if ilvl is None else ilvl
            alvl = nd.levels.get(ilvl, None)
            if alvl is None:
                # NOTE(review): this new Level is never stored in nd.levels,
                # so whatever it reads is discarded -- confirm intent.
                alvl = Level()
            alvl.read_from_xml(lvl, override=True)
    # Apply each level's own start value. Previously the value parsed last
    # was assigned to every overridden level.
    for ilvl, so in start_overrides.iteritems():
        try:
            nd.levels[ilvl].start = so
        except KeyError:
            # Override refers to a level the definition does not have
            pass
    return nd
def get_hpos(anchor, page_width):
    """Return the horizontal position of an anchor as a fraction of the page.

    Looks at ``wp:positionH`` first (margin-relative placement, then an
    explicit alignment, then a numeric offset) and falls back to the ``x``
    attribute of ``wp:simplePos``.

    :param anchor: the ``wp:anchor`` element.
    :param page_width: page width, in the unit ``emu_to_pt`` returns.
    :return: 0, 0.5 or 1 for named positions, ``offset / page_width`` for
        numeric ones, and 0 when nothing usable is found.
    """
    for ph in XPath('./wp:positionH')(anchor):
        rp = ph.get('relativeFrom', None)
        if rp == 'leftMargin':
            return 0
        if rp == 'rightMargin':
            return 1
        for align in XPath('./wp:align')(ph):
            al = align.text
            if al == 'left':
                return 0
            if al == 'center':
                return 0.5
            if al == 'right':
                return 1
        for po in XPath('./wp:posOffset')(ph):
            try:
                pos = emu_to_pt(int(po.text))
            except (TypeError, ValueError):
                continue
            return pos/page_width

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # Attribute values are strings; convert before the unit
            # conversion, as is done for posOffset above. A missing
            # attribute (None) raises TypeError and is skipped.
            x = emu_to_pt(int(sp.get('x', None)))
        except (TypeError, ValueError):
            continue
        return x/page_width

    return 0
def pic_to_img(self, pic, alt, parent):
    """Build an image element for a ``pic:pic`` element.

    :param pic: the picture element to convert.
    :param alt: alternate text supplied by the caller; used unless the
        picture's own ``pic:cNvPr`` carries a non-empty ``descr``.
    :param parent: container searched for ``a:hlinkClick`` elements; when
        one is found the image is recorded in ``self.links`` together with
        the link's id, target frame and tooltip.
    :return: the image element for the first embedded blip whose
        relationship id is in ``self.rid_map``, otherwise None.
    """
    name = None
    link = None
    for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
        link = {'id': get(hl, 'r:id')}
        tgt = hl.get('tgtFrame', None)
        if tgt:
            link['target'] = tgt
        title = hl.get('tooltip', None)
        if title:
            link['title'] = title

    for pr in XPath('descendant::pic:cNvPr')(pic):
        name = pr.get('name', None)
        if name:
            name = ascii_filename(name).replace(' ', '_')
        # Prefer the picture's own description but keep the caller-supplied
        # text when there is none, instead of discarding it.
        alt = pr.get('descr', None) or alt
        for a in XPath('descendant::a:blip[@r:embed]')(pic):
            rid = get(a, 'r:embed')
            if rid in self.rid_map:
                src = self.generate_filename(rid, name)
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if link is not None:
                    self.links.append((img, link))
                return img
def apply_markup(self, rmap, parent=None):
    """Build the HTML table for this table and attach it to the tree.

    :param rmap: mapping from source elements to the HTML elements
        generated for them.
    :param parent: element to append the table to. When None, the table is
        inserted just before the HTML element of the first item yielded by
        iterating ``self``; if there is no such item nothing is done.
    """
    table = TABLE('\n\t\t')
    if parent is not None:
        parent.append(table)
    else:
        try:
            first_para = rmap[next(iter(self))]
        except StopIteration:
            return
        parent = first_para.getparent()
        parent.insert(parent.index(first_para), table)

    find_cells = XPath('./w:tc')
    find_blocks = XPath('./w:p|./w:tbl')
    for row in XPath('./w:tr')(self.tbl):
        tr = TR('\n\t\t\t')
        tr.tail = '\n\t\t'
        table.append(tr)
        for cell in find_cells(row):
            td = TD()
            td.tail = '\n\t\t\t'
            tr.append(td)
            for block in find_blocks(cell):
                if not block.tag.endswith('}p'):
                    # Nested table: let it render itself into this cell
                    self.sub_tables[block].apply_markup(rmap, parent=td)
                else:
                    td.append(rmap[block])
        # The last child closes its parent, so it gets one tab less
        if len(tr):
            tr[-1].tail = '\n\t\t'
    if len(table):
        table[-1].tail = '\n\t'
def __call__(self, root):
    """Record the major and minor latin typefaces from the font schemes.

    Sets ``self.major_latin_font`` / ``self.minor_latin_font`` from the
    ``typeface`` attribute of ``a:latin`` under ``a:majorFont`` /
    ``a:minorFont``; when several match, the last one wins.
    """
    find_latin = XPath('./a:latin[@typeface]')
    targets = (
        ('./a:majorFont', 'major_latin_font'),
        ('./a:minorFont', 'minor_latin_font'),
    )
    for scheme in XPath('//a:fontScheme')(root):
        for expr, attr_name in targets:
            for font in XPath(expr)(scheme):
                for latin in find_latin(font):
                    setattr(self, attr_name, latin.get('typeface'))
def read_padding(parent, dest):
    """Read cell padding for every edge and store it on ``dest``.

    The margins element is ``w:tblCellMar`` when ``parent`` is a ``tblPr``
    element and ``w:tcMar`` otherwise. Edges without a value are set to
    ``inherit``. Results go to ``dest.cell_padding_<edge>``.
    """
    if parent.tag.endswith('}tblPr'):
        name = 'tblCellMar'
    else:
        name = 'tcMar'

    widths = dict.fromkeys(edges, inherit)
    for margins in XPath('./w:%s' % name)(parent):
        for edge_name in edges:
            for elem in XPath('./w:%s' % edge_name)(margins):
                widths[edge_name] = _read_width(elem)

    for edge_name in edges:
        setattr(dest, 'cell_padding_%s' % edge_name, widths[edge_name])
def read_padding(parent, dest):
    """Read cell padding for the four edges and store it on ``dest``.

    The margins element is ``w:tblCellMar`` when ``parent`` is a ``tblPr``
    element and ``w:tcMar`` otherwise. Edges without a value are set to
    ``inherit``. Results go to ``dest.cell_padding_<edge>``.
    """
    name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
    sides = ('left', 'top', 'right', 'bottom')
    # Keep the values in an explicit dict: assigning through locals() is not
    # a reliable way to store values, so parsed widths could be lost and
    # every edge end up as ``inherit``.
    padding = dict.fromkeys(sides, inherit)
    for mar in XPath('./w:%s' % name)(parent):
        for x in sides:
            for edge in XPath('./w:%s' % x)(mar):
                padding[x] = _read_width(edge)
    for x in sides:
        setattr(dest, 'cell_padding_%s' % x, padding[x])
def __call__(self, doc, log):
    """Collect the complex fields of a document and dispatch them to parsers.

    First makes ``self.index_bookmark_prefix`` differ from every ``w:id``
    value in the document. Then walks paragraphs, runs, instruction text and
    begin/end field characters to build ``Field`` objects, appending them to
    ``self.fields``. Finally each field with instructions is passed to the
    ``self.parse_<type>`` method for its name, along with the module-level
    ``parse_<type>`` function and ``log``. Unknown field names are logged
    once each.
    """
    all_ids = frozenset(XPath('//*/@w:id')(doc))
    c = 0
    while self.index_bookmark_prefix in all_ids:
        c += 1
        self.index_bookmark_prefix = self.index_bookmark_prefix.replace(
            '-', '%d-' % c)

    field_nodes = XPath(
        '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or '
        '(name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]'
    )
    # Fields can nest, so track the currently open ones on a stack
    stack = []
    for elem in field_nodes(doc):
        tag = elem.tag
        if tag.endswith('}fldChar'):
            if get(elem, 'w:fldCharType') == 'begin':
                stack.append(Field(elem))
                self.fields.append(stack[-1])
            else:
                try:
                    stack.pop().end = elem
                except IndexError:
                    # An end marker without a matching begin
                    pass
        elif tag.endswith('}instrText'):
            if stack:
                stack[-1].add_instr(elem)
        elif stack:
            stack[-1].contents.append(elem)

    field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')
    # Field names are looked up in both upper and lower case
    spellings = [f.upper() for f in field_types] + list(field_types)
    parsers = {s: getattr(self, 'parse_' + s.lower()) for s in spellings}
    field_parsers = {s: globals()['parse_%s' % s.lower()] for s in spellings}

    for f in field_types:
        setattr(self, '%s_fields' % f, [])

    # The TOC and PAGEREF fields are handled separately
    unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'}

    for field in self.fields:
        field.finalize()
        if not field.instructions:
            continue
        func = parsers.get(field.name, None)
        if func is not None:
            func(field, field_parsers[field.name], log)
        elif field.name not in unknown_fields:
            log.warn('Encountered unknown field: %s, ignoring it.' % field.name)
            # Remember it so the warning is emitted only once per name
            unknown_fields.add(field.name)
def read_numbering(parent, dest):
    """Read the numbering reference from ``w:numPr`` into ``dest.numbering``.

    The stored value is ``(num_id, level)`` when either part was found,
    where ``num_id`` is the raw ``w:numId`` value and ``level`` the integer
    ``w:ilvl`` value (None if absent or not an integer); otherwise it is
    ``inherit``.
    """
    level = num_id = None
    for num_pr in XPath('./w:numPr')(parent):
        for level_elem in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(level_elem, 'w:val'))
            except (ValueError, TypeError):
                pass
        for id_elem in XPath('./w:numId[@w:val]')(num_pr):
            num_id = get(id_elem, 'w:val')

    if num_id is None and level is None:
        val = inherit
    else:
        val = (num_id, level)
    setattr(dest, 'numbering', val)
def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
    """Index footnotes and endnotes by their ``w:id``.

    For each note element with a non-empty id a ``Note`` is created from the
    element and the matching relationships object, and stored in
    ``self.footnotes`` or ``self.endnotes``. A root that is None is skipped.
    """
    def collect(root, expr, rels, store):
        for note in XPath(expr)(root):
            note_id = get(note, 'w:id')
            if note_id:
                store[note_id] = Note(note, rels)

    if footnotes is not None:
        collect(footnotes, './w:footnote[@w:id]', footnotes_rels, self.footnotes)
    if endnotes is not None:
        collect(endnotes, './w:endnote[@w:id]', endnotes_rels, self.endnotes)
def create_instance(n, definition):
    """Create a numbering instance from a definition plus level overrides.

    :param n: the ``w:num`` element whose ``w:lvlOverride`` children are read.
    :param definition: the numbering definition to copy.
    :return: the copy, with the first ``w:lvl`` of each override read into
        the matching level using ``override=True``.
    """
    instance = definition.copy()
    for override in XPath('./w:lvlOverride')(n):
        ilvl = get(override, 'w:ilvl')
        for lvl in XPath('./w:lvl')(override)[:1]:
            own_ilvl = get(lvl, 'w:ilvl')
            if ilvl is None:
                # Fall back to the level number on the w:lvl element itself
                ilvl = own_ilvl
            level = instance.levels.get(ilvl, None)
            if level is None:
                # NOTE(review): this new Level is not added to
                # instance.levels -- confirm that is intended.
                level = Level()
            level.read_from_xml(lvl, override=True)
    return instance
def __call__(self, root, styles):
    """Read all numbering style definitions.

    Fills ``self.definitions`` (keyed by abstract numbering id),
    ``self.instances`` (keyed by numbering id) and ``self.counters`` (one
    ``Counter`` per instance, seeded with each level's start value).

    Abstract definitions that only link to a numbering style are resolved
    after the first pass over ``w:num`` elements, using
    ``styles.numbering_style_links``. Instances that referred to such a
    definition are then created in a second pass.
    """
    # abstract id -> name of the linked numbering style, resolved later
    lazy_load = {}
    for abstract in XPath('./w:abstractNum[@w:abstractNumId]')(root):
        an_id = get(abstract, 'w:abstractNumId')
        style_links = XPath('./w:numStyleLink[@w:val]')(abstract)
        if style_links:
            lazy_load[an_id] = get(style_links[0], 'w:val')
        else:
            self.definitions[an_id] = NumberingDefinition(abstract)

    def create_instance(n, definition):
        instance = definition.copy()
        for override in XPath('./w:lvlOverride')(n):
            ilvl = get(override, 'w:ilvl')
            for lvl in XPath('./w:lvl')(override)[:1]:
                own_ilvl = get(lvl, 'w:ilvl')
                if ilvl is None:
                    ilvl = own_ilvl
                level = instance.levels.get(ilvl, None)
                if level is None:
                    level = Level()
                level.read_from_xml(lvl, override=True)
        return instance

    # numbering id -> (abstract id, element) for definitions not yet known
    next_pass = {}
    for n in XPath('./w:num[@w:numId]')(root):
        an_id = None
        num_id = get(n, 'w:numId')
        for ref in XPath('./w:abstractNumId[@w:val]')(n):
            an_id = get(ref, 'w:val')
        definition = self.definitions.get(an_id, None)
        if definition is not None:
            self.instances[num_id] = create_instance(n, definition)
        else:
            next_pass[num_id] = (an_id, n)

    numbering_links = styles.numbering_style_links
    for an_id, style_link in lazy_load.iteritems():
        linked_num_id = numbering_links[style_link]
        self.definitions[an_id] = self.instances[linked_num_id].copy()

    for num_id, (an_id, n) in next_pass.iteritems():
        definition = self.definitions.get(an_id, None)
        if definition is None:
            continue
        self.instances[num_id] = create_instance(n, definition)

    for num_id, instance in self.instances.iteritems():
        starts = {}
        for lvl in instance.levels:
            starts[lvl] = instance.levels[lvl].start
        self.counters[num_id] = Counter(starts)
def pic_to_img(self, pic, alt=None):
    """Build an image element for a ``pic:pic`` element.

    :param pic: the picture element to convert.
    :param alt: alternate text supplied by the caller; used unless the
        picture's own ``pic:cNvPr`` carries a non-empty ``descr``.
    :return: the image element for the first embedded blip whose
        relationship id is in ``self.rid_map``, otherwise None.
    """
    name = None
    for pr in XPath('descendant::pic:cNvPr')(pic):
        name = pr.get('name', None)
        if name:
            name = ascii_filename(name).replace(' ', '_')
        # Prefer the picture's own description but keep the caller-supplied
        # text when there is none, instead of discarding it.
        alt = pr.get('descr', None) or alt
        for a in XPath('descendant::a:blip[@r:embed]')(pic):
            rid = get(a, 'r:embed')
            if rid in self.rid_map:
                src = self.generate_filename(rid, name)
                img = IMG(src='images/%s' % src)
                # Attributes are set with .set(); the element itself must
                # not be called like a function. Always emit an alt text.
                img.set('alt', alt or 'Image')
                return img
def resolve_run(self, r):
    """Return the fully resolved ``RunStyle`` for a run, caching the result.

    Every property is computed by ``self.run_val`` from the run's direct
    formatting and a list of parent styles collected in this order: the
    default character style, the cached paragraph character style, the
    table run style, then the run's linked character style (or, when the
    run links to none, the character style of the default ``character``
    style). A non-inherited font family is finally resolved through the
    theme and mapped through ``self.fonts``.
    """
    cached = self.run_cache.get(r, None)
    if cached is not None:
        return cached

    paras = XPath('ancestor::w:p[1]')(r)
    p = paras[0] if paras else None
    ans = self.run_cache[r] = RunStyle()

    # Merge all w:rPr children into a single direct-formatting style
    direct_formatting = None
    for rPr in XPath('./w:rPr')(r):
        run_props = RunStyle(rPr)
        if direct_formatting is None:
            direct_formatting = run_props
        else:
            direct_formatting.update(run_props)
    if direct_formatting is None:
        direct_formatting = RunStyle()

    parent_styles = []
    default_char = self.default_styles.get('character', None)
    if self.default_character_style is not None:
        parent_styles.append(self.default_character_style)

    pstyle = self.para_char_cache.get(p, None)
    if pstyle is not None:
        parent_styles.append(pstyle)

    # As best as I can understand the spec, table overrides should be
    # applied before paragraph overrides, but word does it
    # this way, see the December 2007 table header in the demo
    # document.
    table_style = self.tables.run_style(p)
    if table_style is not None:
        parent_styles.append(table_style)

    linked_name = direct_formatting.linked_style
    if linked_name is not None:
        linked = getattr(self.get(linked_name), 'character_style', None)
        if linked is not None:
            parent_styles.append(linked)
    elif default_char is not None and default_char.character_style is not None:
        parent_styles.append(default_char.character_style)

    for attr in ans.all_properties:
        setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))

    if ans.font_family is not inherit:
        family = self.theme.resolve_font_family(ans.font_family)
        ans.font_family = self.fonts.family_for(family, ans.b, ans.i)

    return ans
def read_shd(parent, dest):
    """Set ``dest.background_color`` from the ``w:fill`` of ``w:shd``.

    The value is ``inherit`` when no ``w:shd`` child has a non-empty fill;
    otherwise it is the last fill converted by ``simple_color`` with
    ``auto='transparent'``.
    """
    background = inherit
    for shd in XPath('./w:shd[@w:fill]')(parent):
        fill = get(shd, 'w:fill')
        if not fill:
            continue
        background = simple_color(fill, auto='transparent')
    setattr(dest, 'background_color', background)
def read_indent(parent, dest):
    """Read ``w:ind`` into ``margin_left``, ``margin_right`` and ``text_indent``.

    Each value is formatted as ``<number>em`` when it comes from a
    ``*Chars`` attribute and ``<number>pt`` otherwise, and stays ``inherit``
    when no usable attribute is present. For the text indent, hanging
    attributes are negated and take precedence over first-line attributes.
    """
    def measure(chars, absolute):
        # The *Chars attribute wins over the absolute one. Returns
        # (value, unit); unit is None when neither attribute is present.
        if chars is not None:
            return simple_float(chars, 0.01), 'em'
        if absolute is not None:
            return simple_float(absolute, 0.05), 'pt'
        return None, None

    margin_left = margin_right = text_indent = inherit
    for indent in XPath('./w:ind')(parent):
        amount, unit = measure(get(indent, 'w:leftChars'), get(indent, 'w:left'))
        if amount is not None:
            margin_left = '%.3g%s' % (amount, unit)

        amount, unit = measure(get(indent, 'w:rightChars'), get(indent, 'w:right'))
        if amount is not None:
            margin_right = '%.3g%s' % (amount, unit)

        hanging, hanging_chars = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        first, first_chars = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        # A hanging indent is expressed as a negative text indent
        if hanging is not None:
            hanging = '-' + hanging
        if hanging_chars is not None:
            hanging_chars = '-' + hanging_chars

        amount, unit = measure(hanging_chars, hanging)
        if unit is None:
            # No hanging attribute at all: fall back to the first line ones
            amount, unit = measure(first_chars, first)
        if amount is not None:
            text_indent = '%.3g%s' % (amount, unit)

    setattr(dest, 'margin_left', margin_left)
    setattr(dest, 'margin_right', margin_right)
    setattr(dest, 'text_indent', text_indent)
def read_page_properties(self, doc):
    """Map every paragraph and table to the page properties of its section.

    Rebuilds ``self.page_map`` (element -> ``PageProperties``) and
    ``self.section_starts`` (first element of each section). Tables are
    also registered with ``self.tables``. A paragraph containing ``w:sectPr``
    closes the section made of it and all elements collected since the
    previous section end; whatever remains at the end uses the body-level
    ``w:sectPr``.
    """
    pending = []
    self.page_map = OrderedDict()
    self.section_starts = []

    for block in descendants(doc, 'w:p', 'w:tbl'):
        if block.tag.endswith('}tbl'):
            self.tables.register(block, self.styles)
            pending.append(block)
            continue

        sect = tuple(descendants(block, 'w:sectPr'))
        if not sect:
            pending.append(block)
            continue

        # This paragraph ends a section
        props = PageProperties(sect)
        section = pending + [block]
        for member in section:
            self.page_map[member] = props
        self.section_starts.append(section[0])
        pending = []

    if pending:
        self.section_starts.append(pending[0])
        last = XPath('./w:body/w:sectPr')(doc)
        props = PageProperties(last)
        for member in pending:
            self.page_map[member] = props
def read_default_style_language(raw, mi):
    """Set ``mi.languages`` from the default run language of a styles part.

    :param raw: serialized styles XML, parsed with ``fromstring``.
    :param mi: object whose ``languages`` attribute is set to a one-item
        list holding the first ``w:lang`` value that ``canonicalize_lang``
        accepts. Left untouched when none qualifies.
    """
    root = fromstring(raw)
    find_langs = XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')
    for value in find_langs(root):
        canonical = canonicalize_lang(value)
        if not canonical:
            continue
        mi.languages = [canonical]
        break
def read_underline(parent, dest):
    """Set ``dest.text_decoration`` from the ``w:val`` of ``w:u``.

    A value of ``none`` is kept as is, any other non-empty value becomes
    ``underline``; without a usable value the result is ``inherit``.
    """
    decoration = inherit
    for underline in XPath('./w:u[@w:val]')(parent):
        val = get(underline, 'w:val')
        if not val:
            continue
        if val == 'none':
            decoration = val
        else:
            decoration = 'underline'
    setattr(dest, 'text_decoration', decoration)
def pict_to_html(self, pict, page):
    """Yield HTML elements for a ``w:pict`` element.

    A pict whose only child is flagged ``o:hr`` yields a horizontal rule
    styled from its ``o:hrpct`` and ``o:hralign`` attributes. After that, an
    image is yielded for every ``v:imagedata`` whose relationship id is in
    ``self.rid_map``; linked images that cannot be found are logged and
    skipped.
    """
    # First see if we have an <hr>
    is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
    if is_hr:
        rule = pict[0]
        style = {}
        hr = HR()
        try:
            pct = float(get(rule, 'o:hrpct'))
        except (ValueError, TypeError, AttributeError):
            pass
        else:
            if pct > 0:
                style['width'] = '%.3g%%' % pct
        align = get(rule, 'o:hralign', 'center')
        if align == 'left':
            style['margin-left'] = '0'
            style['margin-right'] = 'auto'
        elif align == 'right':
            style['margin-left'] = 'auto'
            style['margin-right'] = '0'
        if style:
            declarations = ('%s:%s' % (k, v) for k, v in style.iteritems())
            hr.set('style', '; '.join(declarations))
        yield hr

    for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
        rid = get(imagedata, 'r:id')
        if rid not in self.rid_map:
            continue
        try:
            src = self.generate_filename(rid)
        except LinkedImageNotFound as err:
            self.log.warn('Linked image: %s not found, ignoring' % err.fname)
            continue
        img = IMG(src='images/%s' % src, style="display:block")
        alt = get(imagedata, 'o:title')
        img.set('alt', alt or 'Image')
        yield img
def read_letter_spacing(parent, dest):
    """Set ``dest.letter_spacing`` from the ``w:val`` of ``w:spacing``.

    The value is passed through ``simple_float`` with a factor of 0.05; the
    result is ``inherit`` when no child yields a number.
    """
    spacing = inherit
    for elem in XPath('./w:spacing[@w:val]')(parent):
        amount = simple_float(get(elem, 'w:val'), 0.05)
        if amount is None:
            continue
        spacing = amount
    setattr(dest, 'letter_spacing', spacing)
def __init__(self, rPr=None):
    """Create a run style, optionally reading it from a ``w:rPr`` element.

    Without ``rPr`` every name in ``self.all_properties`` is set to
    ``inherit``. With it, the on/off properties are read through
    ``binary_property``, the remaining ones through the module-level
    ``read_<name>`` functions, and ``linked_style`` is taken from
    ``w:rStyle``.
    """
    self.linked_style = None
    if rPr is not None:
        toggles = (
            'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs',
            'imprint', 'rtl', 'shadow', 'smallCaps', 'strike', 'vanish',
            'webHidden',
        )
        for prop in toggles:
            setattr(self, prop, binary_property(rPr, prop))

        readers = (
            'text_border', 'color', 'highlight', 'shd', 'letter_spacing',
            'sz', 'underline', 'vert_align', 'lang', 'font_family',
        )
        for reader_name in readers:
            # Each reader stores its result(s) directly on this object
            globals()['read_%s' % reader_name](rPr, self)

        for style_ref in XPath('./w:rStyle[@w:val]')(rPr):
            self.linked_style = get(style_ref, 'w:val')
    else:
        for prop in self.all_properties:
            setattr(self, prop, inherit)

    self._css = None
def read_vert_align(parent, dest):
    """Set ``dest.vert_align`` from the ``w:val`` of ``w:vertAlign``.

    Only ``baseline``, ``subscript`` and ``superscript`` are accepted; any
    other value leaves the result as ``inherit``.
    """
    allowed = {'baseline', 'subscript', 'superscript'}
    alignment = inherit
    for elem in XPath('./w:vertAlign[@w:val]')(parent):
        val = get(elem, 'w:val')
        if not val:
            continue
        if val in allowed:
            alignment = val
    setattr(dest, 'vert_align', alignment)
def read_single_border(parent, edge):
    """Read the border described by the ``w:<edge>`` child of ``parent``.

    :return: dict mapping the names in ``border_props`` to, in order, the
        padding, width, style and color that were read; each is None when
        not specified.
    """
    color = style = width = padding = None
    for elem in XPath('./w:%s' % edge)(parent):
        raw_color = get(elem, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(elem, 'w:val')
        if raw_style is not None:
            style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(elem, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(elem, 'w:sz')
        if raw_size is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # WebKit needs at least 1pt to render borders
                clamped = max(8, float(raw_size))
                width = min(96, clamped) / 8
            except (ValueError, TypeError):
                pass

    if style == 'double' and width is not None and 0 < width < 3:
        # WebKit needs 3pts to render double borders
        width = 3

    return dict(zip(border_props, (padding, width, style, color)))
def read_text_border(parent, dest):
    """Read ``w:bdr`` into the border and padding attributes of ``dest``.

    Sets ``border_color``, ``border_style``, ``border_width`` and
    ``padding``. All are ``inherit`` when there is no ``w:bdr`` child; when
    there is one, color, style and width start from the defaults
    ``simple_color('auto')``, ``solid`` and 1 before attributes are applied.
    """
    color = style = width = padding = inherit
    borders = XPath('./w:bdr')(parent)
    if borders:
        color = simple_color('auto')
        style = 'solid'
        width = 1

    for bdr in borders:
        raw_color = get(bdr, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(bdr, 'w:val')
        if raw_style is not None:
            style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(bdr, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(bdr, 'w:sz')
        if raw_size is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                clamped = max(8, float(raw_size))
                width = min(96, clamped) / 8
            except (ValueError, TypeError):
                pass

    for attr_name, value in (
        ('border_color', color),
        ('border_style', style),
        ('border_width', width),
        ('padding', padding),
    ):
        setattr(dest, attr_name, value)
def read_sz(parent, dest):
    """Set ``dest.font_size`` from the ``w:val`` of ``w:sz``.

    The value is passed through ``simple_float`` with a factor of 0.5; the
    result is ``inherit`` when no child yields a number.
    """
    size = inherit
    for elem in XPath('./w:sz[@w:val]')(parent):
        amount = simple_float(get(elem, 'w:val'), 0.5)
        if amount is None:
            continue
        size = amount
    setattr(dest, 'font_size', size)