def test_complex_docprop_with_multiple_textnode_in_same_run_are_updated(self):
    """A cached value split over two text nodes is merged into one on update."""
    document = Document(docx_path('two_textnodes_in_run_docproperty.docx'))

    # Sanity-check the fixture before exercising the update.
    all_paragraphs = xpath(document.element.body, '//w:p')
    assert len(all_paragraphs) == 1, 'input file contains one paragraph'
    instr_texts = xpath(document.element.body, '//w:instrText')
    assert len(instr_texts) == 1, \
        'input contains one complex field docproperty'

    values_before = cached_complex_field_values(all_paragraphs[0])
    assert len(values_before) == 2, \
        'doc property value is scattered over 2 parts'
    assert ''.join(values_before) == 'Hello there'

    CustomProperties(document).update_all()

    # Look the paragraph up again and inspect the cached value after update.
    paragraph_after = xpath(document.element.body, '//w:p')[0]
    values_after = cached_complex_field_values(paragraph_after)
    assert len(values_after) == 1, \
        'doc property value has been reset to one cached value'
    assert values_after[0] == 'i will be spllchecked!'
def add_diagrams(self, doc, element):
    """Re-target the diagram relationships referenced below ``element``.

    For every ``dgm:relIds`` element carrying an ``r:dm`` attribute, the
    data, layout, quick-style and colors relationship ids are resolved in
    ``doc`` and replaced by the ids of relationships created on
    ``self.doc.part`` for the same target parts.
    """
    # While waiting docxcompose 1.3.3
    relationship_kinds = (
        ('dm', RT.DIAGRAM_DATA),
        ('lo', RT.DIAGRAM_LAYOUT),
        ('qs', RT.DIAGRAM_QUICK_STYLE),
        ('cs', RT.DIAGRAM_COLORS),
    )
    for rel_ids in xpath(element, './/dgm:relIds[@r:dm]'):
        for suffix, rel_type in relationship_kinds:
            source_rid = rel_ids.get('{%s}%s' % (NS['r'], suffix))
            target_part = doc.part.rels[source_rid].target_part
            copied_rid = self.doc.part.relate_to(target_part, rel_type)
            rel_ids.set('{%s}%s' % (NS['r'], suffix), copied_rid)
def test_complex_docprop_with_multiple_textnode_in_same_run_are_updated():
    """A cached value split over two text nodes is merged into one on update."""
    document = Document(docx_path('two_textnodes_in_run_docproperty.docx'))

    # Sanity-check the fixture before exercising the update.
    all_paragraphs = xpath(document.element.body, '//w:p')
    assert len(all_paragraphs) == 1, 'input file contains one paragraph'
    instr_texts = xpath(document.element.body, '//w:instrText')
    assert len(instr_texts) == 1, \
        'input contains one complex field docproperty'

    nodes_before = xpath(all_paragraphs[0], XPATH_CACHED_DOCPROPERTY_VALUES)
    assert len(nodes_before) == 2, \
        'doc property value is scattered over 2 parts'
    joined_before = ''.join(node.text for node in nodes_before)
    assert joined_before == 'Hello there'

    CustomProperties(document).update_all()

    # Look the paragraph up again and inspect the cached value after update.
    paragraph_after = xpath(document.element.body, '//w:p')[0]
    nodes_after = xpath(paragraph_after, XPATH_CACHED_DOCPROPERTY_VALUES)
    assert len(nodes_after) == 1, \
        'doc property value has been reset to one cached value'
    assert nodes_after[0].text == 'i will be spllchecked!'
def __setitem__(self, key, value):
    """Set the value of a property, adding the property if it is missing."""
    matches = xpath(
        self._element, u'.//cp:property[@name="{}"]'.format(key))
    if matches:
        # The value element is the first child of the matched property.
        old_value_el = matches[0][0]
        replacement_el = value2vt(value)
        old_value_el.getparent().replace(old_value_el, replacement_el)
        self._update_part()
    else:
        self.add(key, value)
def _next_numbering_ids(self):
    """Return the next unused ``(numId, abstractNumId)`` pair.

    Each id is one above the highest id found in the numbering part; when
    none exist, ``numId`` starts at 1 and ``abstractNumId`` at 0.
    """
    numbering_part = self.numbering_part()

    # Determine next unused numId (numbering starts with 1)
    used_num_ids = [
        num.numId for num in xpath(numbering_part.element, './/w:num')]
    next_num_id = max(used_num_ids) + 1 if used_num_ids else 1

    # Determine next unused abstractNumId (numbering starts with 0)
    used_anum_ids = [
        int(raw_id) for raw_id in xpath(
            numbering_part.element, './/w:abstractNum/@w:abstractNumId')]
    next_anum_id = max(used_anum_ids) + 1 if used_anum_ids else 0

    return next_num_id, next_anum_id
def add_shapes(self, doc, element):
    """Re-link images used by ``v:shape/v:imagedata`` below ``element``.

    Each referenced image part of ``doc`` is looked up in our package by
    its SHA1 and added when missing; the ``r:id`` attribute is then pointed
    at a relationship on ``self.doc.part``.
    """
    for imagedata in xpath(element, './/v:shape/v:imagedata'):
        source_rid = imagedata.get('{%s}id' % NS['r'])
        source_part = doc.part.rels[source_rid].target_part

        # Reuse an image part with the same content if we already have one.
        target_part = self.pkg.image_parts._get_by_sha1(source_part.sha1)
        if target_part is None:
            target_part = self.pkg.image_parts._add_image_part(
                ImageWrapper(source_part))

        copied_rid = self.doc.part.relate_to(target_part, RT.IMAGE)
        imagedata.set('{%s}id' % NS['r'], copied_rid)
def _get_fieldname_string(self):
    """Return the field name assembled from its text pieces.

    The field name can be split up over several runs, so the text of all
    nodes matched by ``XPATH_TEXTS`` is collected from the runs located
    before the separate run, or before the end run when the field has no
    separate run.
    """
    separate_run = self.get_separate_run()
    boundary_run = self.end_run if separate_run is None else separate_run
    last = self.w_p.index(boundary_run)

    return "".join(
        text_node.text
        for run in self._runs
        if self.w_p.index(run) < last
        for text_node in xpath(run, self.XPATH_TEXTS))
def test_multiple_identical_docprops_get_updated(self):
    """Every field that shows the same property is refreshed by update_all."""
    document = Document(docx_path('multiple_identical_properties.docx'))
    assert len(document.paragraphs) == 3, 'input file should contain 3 paragraphs'

    # Each paragraph starts out with one complex field showing "Foo".
    for paragraph in document.paragraphs:
        instr_texts = xpath(paragraph._p, './/w:instrText')
        assert len(instr_texts) == 1, \
            'paragraph should contain one complex field docproperties'
        assert paragraph.text == u'Foo'

    CustomProperties(document).update_all()

    for position, paragraph in enumerate(document.paragraphs):
        assert paragraph.text == u'Bar', 'docprop {} was not updated'.format(position+1)
def get_separate_run(self):
    """Return the separate run of this complex field, or None.

    The ooxml format standard says that the separate node is optional, so
    the first following separate only counts when it is positioned before
    the end run of this field.
    """
    candidates = xpath(self.w_r, self.XPATH_FOLLOWING_SEPARATES)
    if candidates:
        first_separate = candidates[0]
        if self.w_p.index(first_separate) < self.w_p.index(self.end_run):
            return first_separate
    return None
def test_removes_simple_field_but_keeps_value(self):
    """Dissolving a simple field drops the field but keeps its text."""
    field_text_query = (
        u'.//w:fldSimple[contains(@w:instr, \'DOCPROPERTY "F\xfc\xfc"\')]//w:t')

    document = Document(
        docx_path('outdated_docproperty_with_umlauts.docx'))
    assert len(document.paragraphs) == 1, 'input file should contain 1 paragraph'

    field_texts = xpath(document.element.body, field_text_query)
    assert len(field_texts) == 1, 'should contain one simple field docproperty'
    assert document.paragraphs[0].text == u'Hie chund ds property: '
    assert field_texts[0].text == u'xxx'

    CustomProperties(document).dissolve_fields(u"F\xfc\xfc")

    remaining_texts = xpath(document.element.body, field_text_query)
    assert len(remaining_texts) == 0, 'should not contain any docproperties anymore'
    # when simple field is removed, the value is moved one up in the hierarchy
    assert document.paragraphs[0].text == u'Hie chund ds property: xxx'
def nullify(self, key):
    """Blank out a text property, or delete a property of any other type.

    Raises KeyError when no property named ``key`` exists.
    """
    matches = xpath(
        self._element, u'.//cp:property[@name="{}"]'.format(key))
    if not matches:
        raise KeyError(key)

    value_el = matches[0][0]
    if not is_text_property(value_el):
        del self[key]
        return
    self[key] = ''
def add_images(self, doc, element):
    """Add images from the given document used in the given element.

    Image parts are matched by SHA1 so that an image we already have is
    reused; the ``r:embed`` attribute of each blip is rewritten to a
    relationship on ``self.doc.part``.
    """
    for blip in xpath(element, '(.//a:blip|.//asvg:svgBlip)[@r:embed]'):
        source_rid = blip.get('{%s}embed' % NS['r'])
        source_part = doc.part.rels[source_rid].target_part

        target_part = self.pkg.image_parts._get_by_sha1(source_part.sha1)
        if target_part is None:
            target_part = self.pkg.image_parts._add_image_part(
                ImageWrapper(source_part))

        copied_rid = self.doc.part.relate_to(target_part, RT.IMAGE)
        blip.set('{%s}embed' % NS['r'], copied_rid)
def test_multiple_identical_docprops_get_updated(self):
    """Updating one property refreshes every field that displays it."""
    document = Document(docx_path('multiple_identical_properties.docx'))
    paragraph_count = len(document.paragraphs)
    assert paragraph_count == 3, 'input file should contain 3 paragraphs'

    # Each paragraph starts out with one complex field showing "Foo".
    for paragraph in document.paragraphs:
        instr_texts = xpath(paragraph._p, './/w:instrText')
        assert len(instr_texts) == 1, \
            'paragraph should contain one complex field docproperties'
        assert paragraph.text == u'Foo'

    CustomProperties(document).update("Text Property", "New value")

    for position, paragraph in enumerate(document.paragraphs):
        assert paragraph.text == u'New value',\
            'docprop {} was not updated'.format(position+1)
def __delitem__(self, key):
    """Delete a property and renumber the pids of the remaining ones.

    Raises KeyError when no property named ``key`` exists.
    """
    matches = xpath(
        self._element, u'.//cp:property[@name="{}"]'.format(key))
    if not matches:
        raise KeyError(key)

    doomed = matches[0]
    doomed.getparent().remove(doomed)

    # Renumber pids
    for new_pid, remaining in enumerate(self._element, MIN_PID):
        remaining.set('pid', text_type(new_pid))

    self._update_part()
def add(self, name, value):
    """Add a property.

    The new property gets a pid one above the highest existing pid, or
    ``MIN_PID`` when there are no properties yet.
    """
    existing_pids = [
        int(raw_pid)
        for raw_pid in xpath(self._element, u'.//cp:property/@pid')]
    next_pid = max(existing_pids) + 1 if existing_pids else MIN_PID

    new_prop = parse_xml('<cp:property xmlns:cp="{}"/>'.format(NS['cp']))
    new_prop.set('fmtid', CUSTOM_PROPERTY_FMTID)
    new_prop.set('name', name)
    new_prop.set('pid', text_type(next_pid))
    new_prop.append(value2vt(value))

    self._element.append(new_prop)
    self._update_part()
def update(self, name):
    """Update a property field value.

    Reads the current value of the property ``name`` via ``self.get``,
    converts it to text and writes that text into the cached value of the
    first simple field and of the first complex field in the document body
    that reference ``DOCPROPERTY "<name>"``.
    """
    value = self.get(name)
    # Convert the value to the text written into the field.
    if isinstance(value, bool):
        value = u'Y' if value else u'N'
    elif isinstance(value, datetime):
        # '%x' is the locale's date representation.
        value = value.strftime('%x')
    else:
        value = text_type(value)

    # Simple field
    sfield = xpath(
        self.doc.element.body,
        u'.//w:fldSimple[contains(@w:instr, \'DOCPROPERTY "{}"\')]'.format(name))
    if sfield:
        # Only the first matching field and its first text node are updated.
        text = xpath(sfield[0], './/w:t')
        if text:
            text[0].text = value

    # Complex field
    cfield = xpath(
        self.doc.element.body,
        u'.//w:instrText[contains(.,\'DOCPROPERTY "{}"\')]'.format(name))
    if cfield:
        # instrText -> parent -> grandparent.
        # NOTE(review): presumably w:instrText -> w:r -> w:p; confirm.
        w_p = cfield[0].getparent().getparent()
        # Runs that have a "separate" fldChar run before them and an "end"
        # fldChar run after them, i.e. the runs holding the cached value.
        runs = xpath(
            w_p,
            u'.//w:r[following-sibling::w:r/w:fldChar/@w:fldCharType="end"'
            u' and preceding-sibling::w:r/w:fldChar/@w:fldCharType="separate"]')
        if runs:
            first_w_r = runs[0]
            text = xpath(first_w_r, u'.//w:t')
            if text:
                text[0].text = value
                # remove any additional text-nodes inside the first run. we
                # update the first text-node only with the full cached
                # docproperty value. if for some reason the initial cached
                # value is split into multiple text nodes we remove any
                # additional node after updating the first node.
                # NOTE(review): remove() assumes each w:t is a direct child
                # of the run although the query matches descendants; confirm.
                for unnecessary_w_t in text[1:]:
                    first_w_r.remove(unnecessary_w_t)
            # if there are multiple runs between "separate" and "end" they
            # all may contain a piece of the cached docproperty value. we
            # can't reliably handle this situation and only update the
            # first node in the first run with the full cached value. it
            # appears any additional runs with text nodes should then be
            # removed to avoid duplicating parts of the cached docproperty
            # value.
            for w_r in runs[1:]:
                text = xpath(w_r, u'.//w:t')
                if text:
                    w_p.remove(w_r)
def get(self, name):
    """Get the value of a property.

    The value is converted according to the tag of its value element:
    ``lpwstr`` gives the text, ``i4`` an int, ``bool`` a bool and
    ``filetime`` a parsed datetime. Returns None when the property does not
    exist or has any other value type.
    """
    matches = xpath(
        self._element, u'.//cp:property[@name="{}"]'.format(name))
    if not matches:
        return None

    value_el = list(matches[0])[0]
    if value_el.tag.endswith(u'}lpwstr'):
        return value_el.text
    if value_el.tag.endswith(u'}i4'):
        return int(value_el.text)
    if value_el.tag.endswith(u'}bool'):
        return value_el.text.lower() == u'true'
    if value_el.tag.endswith(u'}filetime'):
        return CT_CoreProperties._parse_W3CDTF_to_datetime(value_el.text)
    return None
def test_complex_field_gets_updated(self):
    """Updating a single property changes only that paragraph's text."""
    document = Document(docx_path('docproperties.docx'))
    assert len(document.paragraphs) == 6, 'input file should contain 6 paragraphs'

    instr_texts = xpath(document.element.body, './/w:instrText')
    assert len(instr_texts) == 5,\
        'input should contain five complex field docproperties'

    expected_texts = [
        u'Custom Doc Properties',
        u'Text: Foo Bar',
        u'Number: 123',
        u'Boolean: Y',
        u'Date: 11.06.2019',
        u'Float: 1.1',
    ]
    texts_before = [each.text for each in document.paragraphs]
    assert texts_before == expected_texts

    CustomProperties(document).update("Number Property", 423)

    # Only the "Number" paragraph is expected to differ afterwards.
    expected_texts[2] = u'Number: 423'
    texts_after = [each.text for each in document.paragraphs]
    assert texts_after == expected_texts
def add_images(self, doc, element):
    """Add images from the given document used in the given element.

    Embedded image parts are matched by SHA1 so that an image we already
    have is reused. A blip that additionally carries an ``r:link``
    attribute gets that relationship copied as well.
    """
    for blip in xpath(element, '(.//a:blip|.//asvg:svgBlip)[@r:embed]'):
        embed_rid = blip.get('{%s}embed' % NS['r'])
        source_part = doc.part.rels[embed_rid].target_part

        target_part = self.pkg.image_parts._get_by_sha1(source_part.sha1)
        if target_part is None:
            target_part = self.pkg.image_parts._add_image_part(
                ImageWrapper(source_part))

        copied_rid = self.doc.part.relate_to(target_part, RT.IMAGE)
        blip.set('{%s}embed' % NS['r'], copied_rid)

        # handle external reference as images can be embedded and have an
        # external reference
        link_rid = blip.get('{%s}link' % NS['r'])
        if not link_rid:
            continue
        source_rel = doc.part.rels[link_rid]
        copied_rel = self.add_relationship(None, self.doc.part, source_rel)
        blip.set('{%s}link' % NS['r'], copied_rel.rId)
def values(self):
    """Return the converted values of all properties as a list.

    An empty list is returned when there is no properties element.
    """
    if self._element is None:
        return []
    converted = []
    for prop in xpath(self._element, u'.//cp:property'):
        # The value element is the first child of the property.
        converted.append(vt2value(prop[0]))
    return converted
def keys(self):
    """Return the names of all properties as a list.

    An empty list is returned when there is no properties element.
    """
    if self._element is None:
        return []
    names = []
    for prop in xpath(self._element, u'.//cp:property'):
        names.append(prop.get('name'))
    return names
# NOTE(review): this is a fragment of a larger script. src_func, key5,
# MyComposer, root_path and srcs are defined outside the visible source, and
# the nesting below was reconstructed from collapsed text; confirm against
# the original file.
src_func(key5)
composer = MyComposer(Document())
# composer.append(Document(root_path+""+".docx"))
composer.replace(Document(root_path + "/公有云服务介绍" + ".docx"))
composer.replace(Document(root_path + "/机器预填【部分】" + ".docx"))
dest_file_path = root_path + "/机器预填【部分】.docx"
docx_obj = Document(dest_file_path)
# NOTE(review): gindex is not used in the visible part of the script.
gindex = 1
# Walk the top-level body elements, skipping section properties.
for element in docx_obj.element.body:
    if isinstance(element, CT_SectPr):
        continue
    else:
        # print(type(element))
        # Despite the variable name, this matches w:sdt elements (structured
        # document tags) below the current body element.
        bookmarks_start = xpath(element, './/w:sdt')
        if bookmarks_start:
            children = element.getchildren()
            for child in children:
                # Only direct children of the body element are compared with
                # the w:alias tag here.
                if child.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}alias":
                    # index = int(bookmarks_start[0].get(
                    #     "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"))
                    # The w:val attribute is read from the first w:sdt match,
                    # not from the alias child itself.
                    key = bookmarks_start[0].get(
                        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
                    )
                    print(key)
                    src_ee = srcs.get(key)
                    if src_ee is not None:
                        element.remove(child)
                        # bookmarks_start = [src_element]
                        # NOTE(review): src_ees is not used in the visible
                        # part; the fragment presumably continues.
                        src_ees = srcs.get(key)
def items(self):
    """Return ``(name, value)`` pairs of all properties as a list.

    An empty list is returned when there is no properties element.
    """
    if self._element is None:
        return []
    pairs = []
    for prop in xpath(self._element, u'.//cp:property'):
        # The value element is the first child of the property.
        pairs.append((prop.get('name'), vt2value(prop[0])))
    return pairs
def tags_by_alias(self, alias):
    """Get Structured Document Tags by alias.

    Returns the ``w:sdt`` elements of the document body whose
    ``w:sdtPr/w:alias`` has the given ``w:val``.
    """
    body = self.doc.element.body
    query = './/w:sdt/w:sdtPr/w:alias[@w:val="%s"]/ancestor::w:sdt' % alias
    return xpath(body, query)
def update(self, value):
    """Write the formatted ``value`` into the first ``w:t`` below the node.

    Nothing happens when the node contains no ``w:t`` element.
    """
    text_nodes = xpath(self.node, './/w:t')
    if not text_nodes:
        return
    text_nodes[0].text = self._format_value(value)
def restart_first_numbering(self, doc, element):
    """Restart the list numbering for the paragraph style used in ``element``.

    Does nothing unless ``self.restart_numbering`` is set, and at most once
    per style id (tracked in ``self._numbering_restarted``). The relevant
    ``w:num`` element is copied, given a level-0 start override of 1 and a
    fresh numId, inserted into the numbering part, and referenced from the
    paragraph properties of ``element``.

    ``doc`` is not used by this method.
    """
    if not self.restart_numbering:
        return
    style_id = xpath(element, './/w:pStyle/@w:val')
    if not style_id:
        return
    style_id = style_id[0]
    # Each style is handled only once.
    if style_id in self._numbering_restarted:
        return
    style_element = self.doc.styles.element.get_by_id(style_id)
    if style_element is None:
        return
    outline_lvl = xpath(style_element, './/w:outlineLvl')
    if outline_lvl:
        # Styles with an outline level are probably headings.
        # Do not restart numbering of headings
        return

    # if there is a numId referenced from the paragraph, that numId is
    # relevant, otherwise fall back to the style's numId
    local_num_id = xpath(element, './/w:numPr/w:numId/@w:val')
    if local_num_id:
        num_id = local_num_id[0]
    else:
        style_num_id = xpath(style_element, './/w:numId/@w:val')
        if not style_num_id:
            return
        num_id = style_num_id[0]

    numbering_part = self.numbering_part()
    num_element = xpath(numbering_part.element, './/w:num[@w:numId="%s"]' % num_id)
    if not num_element:
        # Styles with no numbering element should not be processed
        return

    # Resolve the abstract numbering definition to inspect its format.
    # NOTE(review): the [0] lookups below assume the w:num has an
    # abstractNumId and that a matching w:abstractNum exists; confirm.
    anum_id = xpath(num_element[0], './/w:abstractNumId/@w:val')[0]
    anum_element = xpath(
        numbering_part.element,
        './/w:abstractNum[@w:abstractNumId="%s"]' % anum_id)
    num_fmt = xpath(anum_element[0], './/w:lvl[@w:ilvl="0"]/w:numFmt/@w:val')
    # Do not restart numbering of bullets
    if num_fmt and num_fmt[0] == 'bullet':
        return

    # Copy the w:num and add an override that starts level 0 at 1.
    new_num_element = deepcopy(num_element[0])
    lvl_override = parse_xml(
        '<w:lvlOverride xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"'
        ' w:ilvl="0"><w:startOverride w:val="1"/></w:lvlOverride>')
    new_num_element.append(lvl_override)
    # Only next_num_id is used; next_anum_id is ignored here.
    next_num_id, next_anum_id = self._next_numbering_ids()
    new_num_element.numId = next_num_id
    self._insert_num(new_num_element)

    # Point the paragraph at the new numId, creating a w:numPr if needed.
    paragraph_props = xpath(
        element, './/w:pPr/w:pStyle[@w:val="%s"]/parent::w:pPr' % style_id)
    num_pr = xpath(paragraph_props[0], './/w:numPr')
    if num_pr:
        num_pr = num_pr[0]
        previous_num_id = num_pr.numId.val
        # NOTE(review): presumably updates the numId mapping so the previous
        # id resolves to the new one; confirm in _replace_mapped_num_id.
        self._replace_mapped_num_id(previous_num_id, next_num_id)
        num_pr.numId.val = next_num_id
    else:
        num_pr = parse_xml(
            '<w:numPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
            '<w:ilvl w:val="0"/><w:numId w:val="%s"/></w:numPr>' % next_num_id)
        paragraph_props[0].append(num_pr)
    self._numbering_restarted.add(style_id)
def begin_run(self):
    """Return the last of the begin nodes preceding this field's run.

    Raises InvalidComplexField when no begin node is found.
    """
    preceding_begins = xpath(self.w_r, self.XPATH_PRECEDING_BEGINS)
    if preceding_begins:
        return preceding_begins[-1]
    raise InvalidComplexField(
        "Complex field without begin node is not supported")
def remove_header_and_footer_references(self, doc, element):
    """Remove all header and footer references found below ``element``."""
    query = './/w:headerReference|.//w:footerReference'
    for reference in xpath(element, query):
        parent = reference.getparent()
        parent.remove(reference)
def _runs(self):
    """Return the ``w:r`` siblings that follow the begin run."""
    following_runs = xpath(self.begin_run, "./following-sibling::w:r")
    return following_runs
def add_styles(self, doc, element):
    """Add styles from the given document used in the given element.

    For every table, paragraph and run style id referenced below
    ``element``: when we do not have the mapped style yet, the style (plus
    its numberings and its linked style) is copied from ``doc``; otherwise
    the abstractNumId used by the style in ``doc`` is mapped to the one
    used by our style. Finally, references in ``element`` are rewritten to
    our style id where it differs.
    """
    our_style_ids = [s.style_id for s in self.doc.styles]
    # de-duplicate ids and keep order to make sure tests are not flaky
    used_style_ids = list(
        OrderedDict.fromkeys([
            e.val for e in
            xpath(element, './/w:tblStyle|.//w:pStyle|.//w:rStyle')
        ]))
    for style_id in used_style_ids:
        our_style_id = self.mapped_style_id(style_id)
        if our_style_id not in our_style_ids:
            # We do not have this style yet: copy it over.
            # NOTE(review): get_by_id may return None (the else branch
            # below guards against that); this branch does not. Confirm.
            style_element = deepcopy(
                doc.styles.element.get_by_id(style_id))
            self.doc.styles.element.append(style_element)
            self.add_numberings(doc, style_element)
            # Also add linked styles
            linked_style_ids = xpath(style_element, './/w:link/@w:val')
            if linked_style_ids:
                linked_style_id = linked_style_ids[0]
                our_linked_style_id = self.mapped_style_id(linked_style_id)
                if our_linked_style_id not in our_style_ids:
                    our_linked_style = doc.styles.element.get_by_id(
                        linked_style_id)
                    self.doc.styles.element.append(
                        deepcopy(our_linked_style))
        else:
            # Create a mapping for abstractNumIds used in existing styles
            # This is used when adding numberings to avoid having multiple
            # <w:abstractNum> elements for the same style.
            style_element = doc.styles.element.get_by_id(style_id)
            if style_element is not None:
                num_ids = xpath(style_element, './/w:numId/@w:val')
                if num_ids:
                    # abstractNumId behind the style's numId in ``doc``
                    anum_ids = xpath(
                        doc.part.numbering_part.element,
                        './/w:num[@w:numId="%s"]/w:abstractNumId/@w:val' % num_ids[0])
                    if anum_ids:
                        our_style_element = self.doc.styles.element.get_by_id(
                            our_style_id)
                        our_num_ids = xpath(our_style_element, './/w:numId/@w:val')
                        if our_num_ids:
                            # abstractNumId behind our style's numId
                            numbering_part = self.numbering_part()
                            our_anum_ids = xpath(
                                numbering_part.element,
                                './/w:num[@w:numId="%s"]/w:abstractNumId/@w:val' % our_num_ids[0])
                            if our_anum_ids:
                                self.anum_id_mapping[int(
                                    anum_ids[0])] = int(our_anum_ids[0])
        # Replace language-specific style id with our style id
        if our_style_id != style_id and our_style_id is not None:
            style_elements = xpath(
                element,
                './/w:tblStyle[@w:val="%(styleid)s"]|'
                './/w:pStyle[@w:val="%(styleid)s"]|'
                './/w:rStyle[@w:val="%(styleid)s"]' % dict(styleid=style_id))
            for el in style_elements:
                el.val = our_style_id
        # Update our style ids
        # (recomputed every iteration because styles may have been added)
        our_style_ids = [s.style_id for s in self.doc.styles]