def read_section(self, node: Any) -> None:
    """
    Register a numbering section for a node carrying "abstractNumId".

    Nodes without that attribute are skipped. The new section is stored in
    self.collections under the integer value of "abstractNumId". Each direct
    "lvl" child is passed to read_numbering_sets; when there are no such
    children the method is applied to every child instead.

    :param node: iterable XML element to inspect
    """
    attributes = XmlPreprocessor.get_clear_attributes(node)
    raw_index = attributes.get('abstractNumId')
    if not raw_index:
        return

    section_id = int(raw_index)
    section = NumberingSetsSection()
    # only an explicit "0" switches the flag off; a missing attribute
    # leaves it on
    restart_flag = attributes.get('restartNumberingAfterBreak') or ''
    section.restart_after_break = restart_flag != '0'
    self.collections[section_id] = section

    levels_found = False
    for child in node:
        # looking for "<w:lvl w:ilvl="0">"
        if XmlPreprocessor.get_clear_tag(child) == 'lvl':
            self.read_numbering_sets(child, section)
            levels_found = True

    try:
        if not levels_found:
            for child in node:
                self.read_section(child)
    finally:
        # the section is initialized even if the recursive pass fails
        section.initialize()
def explore_paragraph_numpr(self, pr: DocParagraph, node: Any):
    """
    Copy list level / list number from the children of node to pr.

    "ilvl" children set pr.list_level, "numId" children set pr.list_number;
    both are read from the "val" attribute and converted to int. Children
    with an empty or missing "val" are ignored.

    :param pr: paragraph that receives the values
    :param node: iterable XML element whose children are inspected
    """
    target_by_tag = {'ilvl': 'list_level', 'numId': 'list_number'}
    for elt in node:
        target = target_by_tag.get(XmlPreprocessor.get_clear_tag(elt))
        if target is None:
            continue
        raw_value = XmlPreprocessor.get_clear_attributes(elt).get('val')
        if raw_value:
            setattr(pr, target, int(raw_value))
def explore_paragraph_properties(self, pr: DocParagraph, node: Any):
    """
    Walk the children of node and apply paragraph properties to pr.

    "pStyle" children apply the style named by their "val" attribute,
    "numPr" children are handed to explore_paragraph_numpr, and every other
    child is searched recursively.

    :param pr: paragraph being filled
    :param node: iterable XML element whose children are inspected
    """
    for elt in node:
        tag = XmlPreprocessor.get_clear_tag(elt)
        if tag == 'numPr':
            self.explore_paragraph_numpr(pr, elt)
            continue
        if tag != 'pStyle':
            self.explore_paragraph_properties(pr, elt)
            continue
        style_name = XmlPreprocessor.get_clear_attributes(elt).get('val')
        if style_name:
            self.apply_paragraph_style(pr, style_name)
def add_hyperlink(self, node: Any, container: List[DocItem]) -> None:
    """
    Build a hyperlink item from node and append it to container.

    Nothing is appended when node has no "t" child or that child has no
    text. The link target is looked up in self.relationship.rel_by_id by
    the node's "id" attribute; an unknown id yields an empty string.

    :param node: XML element describing the hyperlink
    :param container: list that receives the new hyperlink item
    """
    text_nodes = XmlPreprocessor.find_children_by_tag(
        node, 't', first_only=True)
    caption = text_nodes[0].text if text_nodes else None
    if not caption:
        return

    link = self.make_hyperlink_instance()
    link.text = caption

    # resolve the reference: r:id="rId4" -> self.relationship
    rel_id = XmlPreprocessor.get_clear_attribute_val(node, 'id')
    if rel_id:
        link.link = self.relationship.rel_by_id.get(rel_id) or ''
    container.append(link)
def explore_section(self, node: Any, style_set: StyleSets):
    """
    Fill style_set from the "aliases", "ilvl" and "numId" descendants of node.

    "aliases" is split on commas into a set; "ilvl" and "numId" are converted
    with int(), so a missing "val" on either of them raises. Children with
    any other tag are searched recursively.

    :param node: iterable XML element whose children are inspected
    :param style_set: object receiving aliases / ilvl / numId
    """
    for child in node:
        tag = XmlPreprocessor.get_clear_tag(child)
        if tag not in ('aliases', 'ilvl', 'numId'):
            self.explore_section(child, style_set)
            continue

        val = XmlPreprocessor.get_clear_attribute_val(child, 'val')
        if tag == 'aliases':
            style_set.aliases = set((val or '').split(','))
        elif tag == 'ilvl':
            style_set.ilvl = int(val)
        else:
            style_set.numId = int(val)
def traverse_doc_tree(self, node: Any, items_container: List[DocItem]):
    """
    Fill items_container with the paragraphs and tables found under node.

    "p" children go to add_paragraph, "tbl" children go to add_table and
    every other child is traversed recursively.

    :param node: iterable XML element whose children are inspected
    :param items_container: list passed on to add_paragraph / add_table
    """
    for elt in node:
        tag = XmlPreprocessor.get_clear_tag(elt)
        if tag == 'tbl':
            self.add_table(elt, items_container)
            continue
        if tag == 'p':
            self.add_paragraph(elt, items_container)
            continue
        # neither a paragraph nor a table: look inside
        self.traverse_doc_tree(elt, items_container)
def read_num_map(self, node: Any) -> None:
    """
    Map a list number to the id of its section.

    Expected node:
    <w:num w:numId="1">
        <w:abstractNumId w:val="1" />
    </w:num>

    The node's "numId" becomes the key in self.list_num_to_section_id and
    the "val" of its first "abstractNumId" child becomes the value. Nothing
    is stored when either attribute is empty or missing.
    """
    raw_num_id = XmlPreprocessor.get_clear_attribute_val(node, 'numId')
    if not raw_num_id:
        return
    num_id = int(raw_num_id)  # list number

    for child in node:
        if XmlPreprocessor.get_clear_tag(child) != 'abstractNumId':
            continue
        raw_section_id = XmlPreprocessor.get_clear_attribute_val(child, 'val')
        if raw_section_id:
            self.list_num_to_section_id[num_id] = int(raw_section_id)
        # only the first "abstractNumId" child is considered
        return
def traverse_tree(self, node) -> None:
    """
    Find "Relationship" elements under node and parse each of them.

    When node has at least one direct "Relationship" child, those children
    are parsed and the search stops; otherwise every child is searched
    recursively.
    """
    found_here = False
    for child in node:
        if XmlPreprocessor.get_clear_tag(child) == 'Relationship':
            self.parse_relationship(child)
            found_here = True

    if found_here:
        return
    for child in node:
        self.traverse_tree(child)
def read_numbering_sets(self, node: Any, sect: NumberingSetsSection):
    """
    Read one numbering level from node and append it to sect.sets.

    The level comes from the node's "ilvl" attribute (int() is applied
    directly, so a missing attribute raises). The "start", "numFmt",
    "lvlText" and "lvlJc" children supply the remaining fields; an empty
    "val" falls back to 0, 'bullet', '*' and 'left' respectively.

    :param node: iterable XML element describing the level
    :param sect: section whose "sets" list receives the new entry
    """
    # child tag -> (attribute of the numbering set, fallback value)
    text_fields = {
        'numFmt': ('num_fmt', 'bullet'),
        'lvlText': ('lvl_text', '*'),
        'lvlJc': ('lvl_jc', 'left'),
    }

    nm_set = NumberingSets()
    nm_set.level = int(XmlPreprocessor.get_clear_attribute_val(node, 'ilvl'))

    for child in node:
        tag = XmlPreprocessor.get_clear_tag(child)
        if tag == 'start':
            raw_start = XmlPreprocessor.get_clear_attribute_val(child, 'val')
            nm_set.start = int(raw_start or '0')
        elif tag in text_fields:
            field_name, fallback = text_fields[tag]
            raw_value = XmlPreprocessor.get_clear_attribute_val(child, 'val')
            setattr(nm_set, field_name, raw_value or fallback)

    sect.sets.append(nm_set)
def fill_table_rows(self, tbl: DocTable, node: Any):
    """
    Find "tr" elements under node and add each of them to tbl.

    Direct "tr" children are passed to fill_table_row. Only when node has
    no such children is the search repeated inside every child.

    :param tbl: table being filled
    :param node: iterable XML element whose children are inspected
    """
    rows_found = False
    for elt in node:
        if XmlPreprocessor.get_clear_tag(elt) == 'tr':
            rows_found = True
            self.fill_table_row(tbl, elt)

    if not rows_found:
        for elt in node:
            self.fill_table_rows(tbl, elt)
def traverse_tree(self, node: Any) -> None:
    """
    Find "style" elements under node and read each of them.

    Looking for nodes like
    <w:style w:type="paragraph" w:styleId="Heading2">.
    Direct "style" children are passed to read_section; only when there are
    none is the search repeated inside every child.
    """
    styles_found = False
    for child in node:
        if XmlPreprocessor.get_clear_tag(child) == 'style':
            self.read_section(child)
            styles_found = True

    if not styles_found:
        for child in node:
            self.traverse_tree(child)
def fill_table_row(self, tbl: DocTable, node: Any):
    """
    Append one new row to tbl and fill it with the cells found under node.

    Direct "tc" children of node are passed to fill_table_cell. When node
    has no direct "tc" child, its descendants are searched level by level
    for cells, and those cells are added to the same row.

    The search used to re-enter this method for every child, which appended
    an extra empty row to the table for each element visited; exactly one
    row is now created per call.

    :param tbl: table whose "rows" list receives the new row
    :param node: iterable XML element describing the row
    """
    row = DocTableRow()
    tbl.rows.append(row)

    def collect_cells(parent: Any) -> None:
        # add the cells found directly under parent; descend only when
        # this level holds none
        cells_found = False
        for elt in parent:
            if XmlPreprocessor.get_clear_tag(elt) == 'tc':
                cells_found = True
                self.fill_table_cell(row, elt)
        if cells_found:
            return
        for elt in parent:
            collect_cells(elt)

    collect_cells(node)
def explore_paragraph(self, pr: DocParagraph, node: Any):
    """
    Collect the text items and properties of a paragraph from node.

    "pPr" children are read as paragraph properties, "hyperlink" children
    become hyperlink items, "tab" children add a tab character and "t"
    children add their text when it is not empty. Every other child,
    including "r", is explored recursively.

    :param pr: paragraph being filled (its text_items list is appended to)
    :param node: iterable XML element whose children are inspected
    """
    for child in node:
        tag = XmlPreprocessor.get_clear_tag(child)
        if tag == 'pPr':
            self.explore_paragraph_properties(pr, child)
            continue
        if tag == 'hyperlink':
            self.add_hyperlink(child, pr.text_items)
            continue
        if tag == 'tab':
            pr.text_items.append(DocText('\t'))
            continue
        if tag == 't':
            # a "t" element without text contributes nothing
            if child.text:
                pr.text_items.append(DocText(child.text))
            continue
        # "r" and any other element: look inside
        self.explore_paragraph(pr, child)
def traverse_tree(self, node: Any) -> None:
    """
    Find "abstractNum" and "num" elements under node and read them.

    Looking for nodes like
    <w:abstractNum w:abstractNumId="1" restartNumberingAfterBreak="0">.
    Direct "abstractNum" children go to read_section and direct "num"
    children go to read_num_map. Unless both kinds were seen among the
    direct children, the search is repeated inside every child.
    """
    sections_seen = False
    num_maps_seen = False
    for child in node:
        tag = XmlPreprocessor.get_clear_tag(child)
        if tag == 'abstractNum':
            self.read_section(child)
            sections_seen = True
        elif tag == 'num':
            num_maps_seen = True
            self.read_num_map(child)

    if not (sections_seen and num_maps_seen):
        for child in node:
            self.traverse_tree(child)
def read_section(self, node: Any) -> None:
    """
    Read one style node and store it in self.sets under its "styleId".

    Expected node:
    <w:style w:type="paragraph" w:styleId="Heading2">
        <w:aliases w:val="1.1,2nd,B Sub/Bold,B Sub/Bold1,B Sub/Bold11,B Sub/Bold12,B Sub/Bold13" />
        <w:pPr>
            <w:keepNext />
            <w:numPr>
                <w:ilvl w:val="1" />
                <w:numId w:val="19" />
            </w:numPr>
        </w:pPr>
    </w:style>

    Nodes without a "styleId" attribute are skipped. A style whose contents
    cannot be read is removed from self.sets again instead of aborting the
    whole parse.

    :param node: XML element describing the style
    """
    style_id = XmlPreprocessor.get_clear_attribute_val(node, 'styleId')
    if not style_id:
        return

    style_set = StyleSets()
    style_set.styleId = style_id
    self.sets[style_id] = style_set
    try:
        self.explore_section(node, style_set)
    except Exception:
        # best effort: drop the unreadable style. Exception (not a bare
        # except) so that KeyboardInterrupt / SystemExit still propagate.
        self.sets.pop(style_id, None)
def test_remove_namespace(self):
    """remove_namespace yields 'val' for a '{...}'-prefixed and a plain name."""
    qualified = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main"}val'
    for raw_name in (qualified, 'val'):
        self.assertEqual('val', XmlPreprocessor.remove_namespace(raw_name))