class Table(XmlModel):
    """Model bound to the ``tbl`` XML tag; ``rows`` collects ``TableRow`` items."""

    XML_TAG = 'tbl'

    rows = XmlCollection(
        TableRow,
    )

    def calculate_table_cell_spans(self):
        """
        Count how many rows each vertically merged cell covers.

        Columns are identified by a cell's position within its row. A cell
        whose vertical merge value is 'restart' opens a merge for its column
        and is counted once; each following 'continue' cell in the same
        column adds one to the opening cell's count.

        Returns None when the table has no rows, otherwise a dict mapping
        each merge-opening cell to its row count.
        """
        if not self.rows:
            return

        # Column index -> cell that opened the merge currently in progress
        # (None once the merge has been closed).
        merge_origin_by_column = {}
        span_by_cell = {}

        for row in self.rows:
            for col, cell in enumerate(row.cells):
                props = cell.properties

                # An omitted vertical merge element (or omitted properties
                # altogether) means the cell is not part of a merged group,
                # and any merge open above it in this column is closed.
                if props is None or props.vertical_merge is None:
                    merge_origin_by_column[col] = None
                    continue

                if not props:
                    continue

                mode = props.vertical_merge.get('val', 'continue')
                if mode == 'restart':
                    merge_origin_by_column[col] = cell
                    span_by_cell[cell] = span_by_cell.get(cell, 0) + 1
                elif mode == 'continue':
                    origin = merge_origin_by_column.get(col)
                    if origin:
                        span_by_cell[origin] = span_by_cell.get(origin, 0) + 1

        return span_by_cell
class ItemsModel(XmlModel):
    """Model bound to the ``items`` XML tag."""

    XML_TAG = 'items'

    # Two accepted child declarations: a (name, model) pair and a bare model.
    # NOTE(review): the pair form presumably registers AppleModel under the
    # tag name 'apple' -- confirm against XmlCollection.
    children = XmlCollection(
        ('apple', AppleModel),
        OrangeModel,
    )
class SimpleField(XmlModel):
    """
    Model bound to the ``fldSimple`` XML tag.

    The ``instr`` attribute holds the field instruction text: a field type
    followed by optional, whitespace-separated (and optionally double-quoted)
    arguments. ``parse_instr`` splits it into those parts.
    """

    XML_TAG = 'fldSimple'

    instr = XmlAttribute()

    children = XmlCollection(
        Run,
        Hyperlink,
        SmartTagRun,
        InsertedRun,
        DeletedRun,
        SdtRun,
    )

    def _parse_instr_into_field_type_and_arg_string(self):
        """
        Match ``instr`` into (field type, remaining argument string).

        Returns the ``re`` match object, or None when ``instr`` is missing or
        contains no non-whitespace token.
        """
        if self.instr is None:
            # re.match would raise TypeError on None; treat a missing
            # attribute the same as an instruction that does not match.
            return None
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        return re.match(r'^\s*([^\s]+)\s*(.*)$', self.instr)

    def _parse_instr_arg_string_to_args(self, arg_string):
        """
        Tokenize an argument string.

        Returns a list of 2-tuples ``(quoted, bare)`` as produced by
        ``re.findall``; for each token one of the two entries is non-empty.
        """
        return re.findall(r'\s*(?:"([^"]+)"|([^\s]+))+', arg_string)

    def parse_instr(self):
        """
        Split the field instruction into its type and arguments.

        Returns:
            None when ``instr`` is missing or has no field type;
            ``(field_type, None)`` when there are no arguments;
            ``(field_type, [arg, ...])`` otherwise, with surrounding double
            quotes removed from quoted arguments.
        """
        match = self._parse_instr_into_field_type_and_arg_string()
        if not match:
            return

        field_type = match.group(1)
        raw_field_args = match.group(2)
        if not raw_field_args:
            return field_type, None

        tokens = self._parse_instr_arg_string_to_args(raw_field_args)
        if not tokens:
            return field_type, None

        # Prefer the quoted capture; fall back to the bare one.
        field_args = [quoted or bare for quoted, bare in tokens]
        return field_type, field_args
class Hyperlink(XmlModel):
    """
    Model bound to the ``hyperlink`` XML tag.

    ``hyperlink_id`` (XML attribute ``id``) names a relationship on the
    container's package part; ``anchor`` is an optional fragment appended to
    the resolved target.
    """

    XML_TAG = 'hyperlink'

    hyperlink_id = XmlAttribute(name='id')
    anchor = XmlAttribute(name='anchor')

    children = XmlCollection(
        Run,
    )

    @memoized
    def get_target_uri(self):
        """
        Resolve the link target through the container's package part.

        Returns None when there is no container, no package part, or the
        relationship lookup raises KeyError. Otherwise returns the
        relationship's target URI, with ``#<anchor>`` appended when an anchor
        is set.
        """
        container = self.container
        package_part = container.package_part if container else None
        if not package_part:
            return None

        try:
            relationship = package_part.get_relationship(
                relationship_id=self.hyperlink_id,
            )
        except KeyError:
            return None

        if not self.anchor:
            return relationship.target_uri
        return '{0}#{1}'.format(relationship.target_uri, self.anchor)

    @property
    def target_uri(self):
        """The (memoized) resolved target URI; see ``get_target_uri``."""
        return self.get_target_uri()

    @target_uri.setter
    def target_uri(self, target_uri):
        # Override the resolved value by writing straight into the memo
        # cache of get_target_uri for this instance.
        self.get_target_uri.memo.set_cache(target_uri, self)
class SmartTagRun(XmlModel):
    """Model bound to the ``smartTag`` XML tag; may contain runs and nested smart tags."""

    XML_TAG = 'smartTag'

    # The second entry refers to this class itself by dotted name.
    # NOTE(review): presumably resolved lazily by XmlCollection -- confirm.
    children = XmlCollection(
        Run,
        'wordprocessing.SmartTagRun',
    )
class Numbering(XmlModel):
    """
    Model bound to the ``numbering`` XML tag.

    On construction the ``elements`` collection is indexed into two lookup
    tables: abstract definitions by ``abstract_num_id`` and numbering
    instances by ``num_id``.
    """

    XML_TAG = 'numbering'

    elements = XmlCollection(AbstractNum, NumberingInstance)

    def __init__(self, **kwargs):
        super(Numbering, self).__init__(**kwargs)

        self._abstract_nums = {}
        self._nums = {}

        for element in self.elements:
            if isinstance(element, AbstractNum):
                self._abstract_nums[element.abstract_num_id] = element
                continue
            if isinstance(element, NumberingInstance):
                self._nums[element.num_id] = element
                continue
            # Anything else means the collection yielded a type it was not
            # declared with.
            raise AssertionError(
                'Unexpected element type {type} encountered'.format(
                    type=element.__class__.__name__,
                ))

    def get_numbering_definition(self, num_id):
        """
        Return the abstract definition referenced by instance ``num_id``.

        Returns None when no instance has that id, or when the instance's
        ``abstract_num_id`` is not among the indexed abstract definitions.
        """
        instance = self._nums.get(num_id)
        if instance:
            return self._abstract_nums.get(instance.abstract_num_id)
        return None
class AbstractNum(XmlModel):
    """
    Model bound to the ``abstractNum`` XML tag.

    Holds the ``levels`` of a numbering definition and indexes them by
    ``level_id`` on construction.
    """

    XML_TAG = 'abstractNum'

    abstract_num_id = XmlAttribute(name='abstractNumId')
    name = XmlChild(attrname='val')

    levels = XmlCollection(Level)

    def __init__(self, **kwargs):
        super(AbstractNum, self).__init__(**kwargs)

        self._levels = {level.level_id: level for level in self.levels}

    def get_level(self, level_id):
        """Return the level with the given ``level_id``, or None."""
        return self._levels.get(level_id)

    def get_indentation_between_levels(self):
        """
        Return the left-indentation step between the first two levels.

        Depending on the word version we may get a different default
        indentation between levels. Only the first two levels are inspected,
        as the others follow the same step.

        Returns 720 when fewer than two levels are defined. A level without
        paragraph properties is treated as having a left indentation of 0,
        the same value used when the properties carry no indentation.
        """
        def left_indentation(level):
            properties = level.paragraph_properties
            if properties is None:
                # Missing properties previously raised AttributeError here.
                return 0
            return properties.to_int('indentation_left', default=0)

        # Keep the try body limited to the indexing that can raise IndexError.
        try:
            first_level = self.levels[0]
            second_level = self.levels[1]
        except IndexError:
            return 720  # default one

        return left_indentation(second_level) - left_indentation(first_level)
class NumberingInstance(XmlModel):
    """
    Model bound to the ``num`` XML tag.

    ``num_id`` comes from the ``numId`` attribute; ``abstract_num_id`` is
    read from the ``val`` attribute of the ``abstractNumId`` child.
    """

    XML_TAG = 'num'

    num_id = XmlAttribute(name='numId')
    abstract_num_id = XmlChild(name='abstractNumId', attrname='val')

    level_overrides = XmlCollection(LevelOverride)
class TableCell(XmlModel):
    """Model bound to the ``tc`` XML tag, with optional ``TableCellProperties``."""

    XML_TAG = 'tc'

    properties = XmlChild(type=TableCellProperties)

    children = XmlCollection(
        Paragraph,
        # Table is added in wordprocessing.table
    )
class InsertedRun(XmlModel):
    """Model bound to the ``ins`` XML tag; contains runs and smart-tag runs."""

    XML_TAG = 'ins'

    children = XmlCollection(
        Run,
        SmartTagRun,
        # TODO Needs DeletedRun
    )
class Fallback(XmlModel):
    """Model bound to the ``Fallback`` XML tag; accepts any child element."""

    XML_TAG = 'Fallback'

    # Ideally this would lazily reuse the grandparent's children
    # XmlCollection. That is not possible today: there is no good way to
    # represent lazy fields, and fields cannot reference their parents before
    # they are initialized with content. Hence every child is allowed.
    children = XmlCollection(allow_all_children=True)
class EmbeddedObject(XmlModel):
    """
    Model bound to the ``object`` XML tag; contains ``Shape`` children.

    reference:
    https://msdn.microsoft.com/en-us/library/documentformat.openxml.wordprocessing.embeddedobject%28v=office.15%29.aspx
    """

    XML_TAG = 'object'

    children = XmlCollection(Shape)
class DeletedRun(XmlModel):
    """Model bound to the ``del`` XML tag; contains runs, smart-tag runs and nested deletions."""

    XML_TAG = 'del'

    # The dotted-name entry refers to this class itself.
    # NOTE(review): presumably resolved lazily by XmlCollection -- confirm.
    children = XmlCollection(
        Run,
        SmartTagRun,
        'wordprocessing.DeletedRun',
        # TODO Needs InsertedRun
    )
class SdtContentBlock(XmlModel):
    """Model bound to the ``sdtContent`` XML tag, holding block-level children."""

    XML_TAG = 'sdtContent'

    children = XmlCollection(
        Paragraph,
        Table,
        InsertedRun,
        DeletedRun,
        # SdtBlock,
    )
class Endnote(XmlModel):
    """Model bound to the ``endnote`` XML tag; ``endnote_id`` is its ``id`` attribute."""

    XML_TAG = 'endnote'

    endnote_id = XmlAttribute(name='id')

    children = XmlCollection(
        Paragraph,
        Table,
        InsertedRun,
        DeletedRun,
    )
class SdtContentRun(XmlModel):
    """Model bound to the ``sdtContent`` XML tag, holding run-level children."""

    XML_TAG = 'sdtContent'

    children = XmlCollection(
        Run,
        Hyperlink,
        SmartTagRun,
        InsertedRun,
        DeletedRun,
        # SdtRun,
    )
class Shape(XmlModel):
    """
    Model bound to the ``shape`` XML tag.

    ``style`` is the raw attribute text, a ``;``-separated list of
    ``name:value`` declarations; ``get_style`` parses it into a dict.
    """

    XML_TAG = 'shape'

    style = XmlAttribute()

    children = XmlCollection(ImageData, 'vml.Textbox')

    # TODO perhaps we could have a prepare_style, or clean_style convention?
    def get_style(self):
        """
        Parse the ``style`` attribute into a ``{name: value}`` dict.

        Each declaration is split on its first ``:``; names and values are
        stripped of surrounding whitespace. Declarations without a colon or
        with an empty name (for example the blank left by a trailing
        ``"; "``) are skipped instead of raising ValueError. Returns an empty
        dict when the attribute is missing or empty.
        """
        if not self.style:
            return {}

        parsed = {}
        for declaration in self.style.split(';'):
            name, separator, value = declaration.partition(':')
            name = name.strip()
            if not separator or not name:
                continue
            parsed[name] = value.strip()
        return parsed
class Styles(XmlModel):
    """
    Model bound to the ``styles`` XML tag.

    On construction the ``styles`` collection is grouped into
    ``styles_by_type``: ``{style_type: {style_id: style}}``.
    """

    XML_TAG = 'styles'

    styles = XmlCollection(Style)

    def __init__(self, styles=None, *args, **kwargs):
        super(Styles, self).__init__(styles=styles, *args, **kwargs)

        grouped = {}
        for style in self.styles:
            grouped.setdefault(style.style_type, {})[style.style_id] = style
        self.styles_by_type = grouped

    def get_styles_by_type(self, style_type):
        """Return the ``{style_id: style}`` dict for ``style_type`` (empty dict if none)."""
        return self.styles_by_type.get(style_type, {})
class Endnotes(XmlModel):
    """
    Model bound to the ``endnotes`` XML tag.

    Children with a truthy ``endnote_id`` are indexed by that id on
    construction.
    """

    XML_TAG = 'endnotes'

    children = XmlCollection(Endnote)

    def __init__(self, *args, **kwargs):
        super(Endnotes, self).__init__(*args, **kwargs)

        # Later children with a duplicate id overwrite earlier ones.
        self._endnote_by_id = {
            child.endnote_id: child
            for child in self.children
            if child.endnote_id
        }

    def get_endnote_by_id(self, endnote_id):
        """Return the endnote indexed under ``endnote_id``, or None."""
        return self._endnote_by_id.get(endnote_id)
class Footnotes(XmlModel):
    """
    Model bound to the ``footnotes`` XML tag.

    Children with a truthy ``footnote_id`` are indexed by that id on
    construction.
    """

    XML_TAG = 'footnotes'

    children = XmlCollection(Footnote)

    def __init__(self, *args, **kwargs):
        super(Footnotes, self).__init__(*args, **kwargs)

        # Later children with a duplicate id overwrite earlier ones.
        self._footnote_by_id = {
            child.footnote_id: child
            for child in self.children
            if child.footnote_id
        }

    def get_footnote_by_id(self, footnote_id):
        """Return the footnote indexed under ``footnote_id``, or None."""
        return self._footnote_by_id.get(footnote_id)
class AbstractNum(XmlModel):
    """
    Model bound to the ``abstractNum`` XML tag.

    Indexes its ``levels`` by ``level_id`` on construction.
    """

    XML_TAG = 'abstractNum'

    abstract_num_id = XmlAttribute(name='abstractNumId')
    name = XmlChild(attrname='val')

    levels = XmlCollection(Level)

    def __init__(self, **kwargs):
        super(AbstractNum, self).__init__(**kwargs)

        # Later levels with a duplicate id overwrite earlier ones.
        self._levels = {item.level_id: item for item in self.levels}

    def get_level(self, level_id):
        """Return the level indexed under ``level_id``, or None."""
        return self._levels.get(level_id)
class Textbox(XmlModel):
    """Model bound to the ``textbox`` XML tag."""

    XML_TAG = 'textbox'

    # The child type is given by dotted name.
    # NOTE(review): presumably resolved lazily by XmlCollection -- confirm.
    children = XmlCollection(
        'wordprocessing.TxBxContent',
    )
class TxBxContent(XmlModel):
    """Model bound to the ``txbxContent`` XML tag; holds paragraphs and tables."""

    XML_TAG = 'txbxContent'

    # Child types are given by dotted name.
    # NOTE(review): presumably resolved lazily by XmlCollection -- confirm.
    children = XmlCollection(
        'wordprocessing.Paragraph',
        'wordprocessing.Table',
    )
class SmartTagRun(XmlModel):
    """Model bound to the ``smartTag`` XML tag; contains ``Run`` children only."""

    XML_TAG = 'smartTag'

    children = XmlCollection(
        Run,
    )
class Paragraph(XmlModel):
    """
    Model bound to the ``p`` XML tag.

    Carries optional ``ParagraphProperties`` and run-level children, and
    offers helpers for style-chain lookup, numbering lookup and plain-text
    extraction.
    """

    XML_TAG = 'p'

    properties = XmlChild(type=ParagraphProperties)

    children = XmlCollection(
        Run,
        Hyperlink,
        SmartTagRun,
        InsertedRun,
        DeletedRun,
        SdtRun,
        SimpleField,
    )

    def __init__(self, **kwargs):
        super(Paragraph, self).__init__(**kwargs)
        self._effective_properties = None

    @property
    def effective_properties(self):
        """The paragraph's own properties, cached once they are truthy."""
        # TODO need to calculate effective properties like Run
        if not self._effective_properties:
            properties = self.properties
            self._effective_properties = properties
        return self._effective_properties

    def has_structured_document_parent(self):
        """Return whether this paragraph has an ``SdtBlock`` ancestor."""
        from pydocx.openxml.wordprocessing import SdtBlock
        return self.has_ancestor(SdtBlock)

    def get_style_chain_stack(self):
        """
        Yield the styles in this paragraph's style chain.

        Yields nothing when the paragraph has no properties, no parent
        style, or its container has no ``style_definitions_part``.
        """
        if not self.properties:
            return

        parent_style = self.properties.parent_style
        if not parent_style:
            return

        # TODO the getattr is necessary because of footnotes. From the context
        # of a footnote, a paragraph's container is the footnote part, which
        # doesn't have access to the style_definitions_part
        part = getattr(self.container, 'style_definitions_part', None)
        if part:
            style_stack = part.get_style_chain_stack('paragraph', parent_style)
            for result in style_stack:
                yield result

    @property
    def heading_style(self):
        """
        The first style in the style chain that is a heading, or None.

        The result is cached on the instance after the first lookup and can
        be overridden through the setter.
        """
        if hasattr(self, '_heading_style'):
            return getattr(self, '_heading_style')
        style_stack = self.get_style_chain_stack()
        heading_style = None
        for style in style_stack:
            if style.is_a_heading():
                heading_style = style
                break
        self.heading_style = heading_style
        return heading_style

    @heading_style.setter
    def heading_style(self, style):
        self._heading_style = style

    def get_numbering_definition(self):
        """
        Return the numbering definition this paragraph refers to.

        Returns None when the container has no numbering definitions part,
        or when the paragraph has no effective or numbering properties.
        """
        # TODO add memoization

        # TODO the getattr is necessary because of footnotes. From the context
        # of a footnote, a paragraph's container is the footnote part, which
        # doesn't have access to the numbering_definitions_part
        part = getattr(self.container, 'numbering_definitions_part', None)
        if not part:
            return
        if not self.effective_properties:
            return
        numbering_properties = self.effective_properties.numbering_properties
        if not numbering_properties:
            return
        return part.numbering.get_numbering_definition(
            num_id=numbering_properties.num_id,
        )

    def get_numbering_level(self):
        """
        Return the numbering level this paragraph refers to.

        Returns None when no numbering definition can be resolved or the
        paragraph has no effective or numbering properties.
        """
        # TODO add memoization
        numbering_definition = self.get_numbering_definition()
        if not numbering_definition:
            return
        if not self.effective_properties:
            return
        numbering_properties = self.effective_properties.numbering_properties
        if not numbering_properties:
            return
        return numbering_definition.get_level(
            level_id=numbering_properties.level_id,
        )

    @property
    def runs(self):
        """Yield the direct children that are ``Run`` instances."""
        for p_child in self.children:
            if isinstance(p_child, Run):
                yield p_child

    def get_text(self, tab_char=None):
        '''
        Return a string of all of the contained Text nodes concatenated
        together. If `tab_char` is set, then any TabChar encountered will be
        represented in the returned text using the specified string.

        Only runs that are direct children of the paragraph are considered.

        For example:

        Given the following paragraph XML definition:

            <p>
                <r>
                    <t>abc</t>
                </r>
                <r>
                    <t>def</t>
                </r>
            </p>

        `get_text()` will return 'abcdef'
        '''
        text = []
        for run in self.runs:
            for r_child in run.children:
                if isinstance(r_child, Text):
                    if r_child.text:
                        text.append(r_child.text)
                if tab_char and isinstance(r_child, TabChar):
                    text.append(tab_char)
        return ''.join(text)

    def get_number_of_initial_tabs(self):
        '''
        Return the number of initial TabChars.

        Counting walks the leading Run children in order and stops at the
        first paragraph child that is not a Run, or at the first run child
        that is not a TabChar, whichever comes first.
        '''
        tab_count = 0
        for p_child in self.children:
            if not isinstance(p_child, Run):
                break
            for r_child in p_child.children:
                if not isinstance(r_child, TabChar):
                    # Non-tab content ends the leading tab sequence for the
                    # whole paragraph, not only for this run; tabs in later
                    # runs are no longer "initial".
                    return tab_count
                tab_count += 1
        return tab_count
class AlternateContent(XmlModel):
    """Model bound to the ``AlternateContent`` XML tag; only ``Fallback`` children are collected."""

    XML_TAG = 'AlternateContent'

    children = XmlCollection(Fallback)
class TableRow(XmlModel):
    """Model bound to the ``tr`` XML tag; ``cells`` collects ``TableCell`` items."""

    XML_TAG = 'tr'

    cells = XmlCollection(
        TableCell,
    )
class Picture(XmlModel):
    """Model bound to the ``pict`` XML tag; contains ``Shape`` and ``Rect`` children."""

    XML_TAG = 'pict'

    children = XmlCollection(
        Shape,
        Rect,
    )
class Run(XmlModel):
    """
    Model bound to the ``r`` XML tag.

    Carries optional ``RunProperties`` and exposes the properties inherited
    from the parent paragraph's style chain and the run's own style chain,
    as well as the effective combination of all three.
    """

    XML_TAG = 'r'

    properties = XmlChild(type=RunProperties)

    children = XmlCollection(
        EmbeddedObject,
        TabChar,
        Break,
        NoBreakHyphen,
        Text,
        Drawing,
        Picture,
        DeletedText,
        FootnoteReference,
        FootnoteReferenceMark,
        EndnoteReference,
        EndnoteReferenceMark,
        FieldChar,
        FieldCode,
        AlternateContent,
    )

    def get_style_chain_stack(self):
        """
        Yield the styles in this run's 'character' style chain.

        Yields nothing when the run has no properties, no parent style, or
        its container has no ``style_definitions_part``.
        """
        if not self.properties:
            return

        style_id = self.properties.parent_style
        if not style_id:
            return

        # TODO the getattr is necessary because of footnotes. From the context
        # of a footnote, a paragraph's container is the footnote part, which
        # doesn't have access to the style_definitions_part
        part = getattr(self.container, 'style_definitions_part', None)
        if not part:
            return
        for style in part.get_style_chain_stack('character', style_id):
            yield style

    def _get_properties_inherited_from_parent_paragraph(self):
        """
        Collect run-property fields from the parent paragraph's style chain.

        The chain is applied in reverse order, so styles yielded earlier by
        the chain override those yielded later. Returns an empty dict when
        there is no Paragraph ancestor.
        """
        from pydocx.openxml.wordprocessing.paragraph import Paragraph

        collected = {}
        paragraph = self.get_first_ancestor(Paragraph)
        if not paragraph:
            return collected

        chain = list(paragraph.get_style_chain_stack())
        for style in reversed(chain):
            if style.run_properties:
                collected.update(dict(style.run_properties.fields))
        return collected

    def _get_inherited_properties_from_parent_style(self):
        """
        Collect run-property fields from this run's own style chain.

        The chain is applied in reverse order, so styles yielded earlier by
        the chain override those yielded later.
        """
        collected = {}
        chain = list(self.get_style_chain_stack())
        for style in reversed(chain):
            if style.run_properties:
                collected.update(dict(style.run_properties.fields))
        return collected

    @property
    def inherited_properties(self):
        """
        ``RunProperties`` built from inherited fields only.

        Fields from the run's own style chain override those inherited via
        the parent paragraph.
        """
        merged = {}
        from_paragraph = self._get_properties_inherited_from_parent_paragraph()
        merged.update(from_paragraph)
        from_style = self._get_inherited_properties_from_parent_style()
        merged.update(from_style)
        return RunProperties(**merged)

    @property
    @memoized
    def effective_properties(self):
        """
        ``RunProperties`` combining inherited fields with the run's own.

        The run's directly specified properties, when present, override the
        inherited ones.
        """
        combined = {}
        combined.update(dict(self.inherited_properties.fields))
        if self.properties:
            combined.update(dict(self.properties.fields))
        return RunProperties(**combined)