def it_can_add_text(self, add_text_fixture, Text_):
    """Check that ``add_text`` rewrites the run XML and returns ``Text_``'s product.

    The fixture provides the ``<w:r>`` element, the string to add and the
    XML the run is expected to serialize to afterwards.
    """
    r_elm, text_to_add, want_xml = add_text_fixture
    subject = Run(r_elm, None)

    returned = subject.add_text(text_to_add)

    assert subject._r.xml == want_xml
    assert returned is Text_.return_value
def runs(self):
    """
    Sequence of |Run| instances corresponding to the <w:r> elements in
    this paragraph. Runs that are direct children come first, followed by
    the <w:r> children of each <w:hyperlink> child, in document order.
    """
    hyperlink_tag = qn('w:hyperlink')
    run_tag = qn('w:r')

    found = [Run(r_elm, self) for r_elm in self._p.r_lst]
    found.extend(
        Run(r_elm, self)
        for link in self._p
        if link.tag == hyperlink_tag
        for r_elm in link
        if r_elm.tag == run_tag
    )
    return found
def addToDoc(self, run: Run):
    """Remove the placeholder text from *run* and add this picture to it.

    The picture is read from ``self.path`` and inserted with width
    ``self.width``. Any failure is printed to stdout instead of being
    raised, so the caller continues regardless.
    """
    # Disabled border experiment, kept for reference:
    #img = Image.open(replace)
    #img_with_border = ImageOps.expand(img, border=1, fill='black')
    #imgByteArr = BytesIO()
    #img_with_border.save(imgByteArr, format='PNG')
    #replace = imgByteArr

    # NOTE(review): ``search`` is not a parameter or local of this method;
    # presumably it is defined at module level -- confirm.
    try:
        stripped = run.text.replace(search, "")
        run.text = stripped
        run.add_picture(self.path, width=self.width)
    except Exception as exc:
        print(f"Error adding picture, {self.path}")
        print(exc)
def compile_data(self):
    """Render ``self._elements`` to HTML and store it in ``self.data['html']``.

    Elements are either ``NestedElement`` objects (list, hyperlink,
    footnote or break, each rendered by its own ``parse_*`` helper) or
    plain ``(child, paragraph)`` pairs rendered through ``parse_paragraph``.
    A ``</p><p>`` boundary is emitted whenever the owning paragraph changes.
    Empty ``<p></p>`` pairs are stripped, and ``self.skip`` is set when
    nothing is left.
    """
    markup = '<p>'
    last_paragraph = None

    for el in self._elements:
        if type(el) == NestedElement:
            # a nested element belongs to the paragraph of its first entry
            paragraph = el._elements[0][1]
            kind = el.type
            # NOTE(review): a kind outside these four leaves ``html_``
            # unbound (first pass) or holding the previous element's markup.
            if kind == 'list':
                html_ = parse_list(el, self.section)
            elif kind == 'hyperlink':
                html_ = parse_hyperlink(el, self.section)
            elif kind == 'footnote':
                html_ = parse_footnote(el)
            elif kind == 'break':
                html_ = parse_break(el, markup)
        else:
            child, paragraph = el
            html_ = parse_paragraph(Run(child, paragraph), paragraph)

        if last_paragraph is not None and last_paragraph != paragraph:
            markup += '</p><p>'
        markup += html_
        last_paragraph = paragraph

    markup += '</p>'
    rendered = markup.replace('<p></p>', '')
    self.data['html'] = rendered
    if rendered == '':
        self.skip = True
def compile_data(self):
    """Concatenate the text of every run and store it in ``self.data['text']``.

    ``self._elements`` is iterated as ``(child, paragraph)`` pairs, each of
    which is wrapped in a ``Run`` to read its ``text``.
    """
    self.data['text'] = ''.join(
        Run(child, paragraph).text for child, paragraph in self._elements
    )
def parse_table(table, section):
    """Flatten *table* into rows of cell strings.

    Every cell's paragraphs are reduced to a single string:

    * children whose tag is not in ``TAGS`` are ignored;
    * a break that is the first child of its paragraph is dropped, inner
      breaks are kept;
    * footnote runs become ``{{<footnote>}}`` placeholders;
    * hyperlinks become markdown links, with the URL looked up in
      ``section._links``;
    * paragraphs of a multi-paragraph cell are joined with ``<br/>``, with
      no trailing ``<br/>`` after the last one.

    Returns a dict with ``'data'`` (a list of rows, each a list of cell
    strings) and ``'first_row_is_table_header'`` (always ``True``).
    """
    data = []
    for row in table.rows:
        cells = []
        for cell in row.cells:
            paragraphs = cell.paragraphs
            last_paragraph_index = len(paragraphs) - 1
            text = ''
            for p_index, p in enumerate(paragraphs):
                p_text = ''
                for child_index, child in enumerate(p._element):
                    tag = TAGS.get(child.tag, None)
                    linkId = None
                    if tag is None:
                        continue
                    # keep inner breaks but discard pre-content breaks
                    if tag == 'break' and child_index == 0:
                        continue
                    elif tag == 'hyperlink':
                        linkId = childishyperlink(child)
                        # the visible link text lives in the nested run
                        child = child.find('.//w:r', NAMESPACES)
                    run = Run(child, p)
                    footnote = runisfootnote(run)
                    if footnote:
                        p_text += '{{%s}}' % footnote
                    elif tag == 'hyperlink':
                        # markdown link
                        p_text += '[%s](%s)' % (
                            run.text, section._links.get(linkId, None))
                    elif run.text is not None:
                        p_text += run.text
                # Separate paragraphs with a line break, but not after the
                # last one. The paragraph index is used here; the previous
                # code compared the child index left over from the inner
                # loop, which was wrong (and unbound for an empty paragraph).
                if p_index != last_paragraph_index:
                    p_text += '<br/>'
                text += p_text
            cells.append(text)
        data.append(cells)
    return {
        'data': data,
        'first_row_is_table_header': True
    }
def add_picture_fixture(self, part_prop_, document_part_, InlineShape_, picture_):
    """Assemble the objects used by the add-picture test.

    Wires ``document_part_.new_pic_inline`` to return a ``wp:inline``
    element and ``InlineShape_`` to return ``picture_``. Returns the run,
    image name, width, height, inline element, expected XML and both mocks.
    """
    run = Run(element('w:r/wp:x'), None)
    inline = element('wp:inline{id=42}')
    expected_xml = xml('w:r/(wp:x,w:drawing/wp:inline{id=42})')

    document_part_.new_pic_inline.return_value = inline
    InlineShape_.return_value = picture_

    return (
        run, 'foobar.png', 1111, 2222, inline, expected_xml, InlineShape_,
        picture_,
    )
def parse_hyperlink(el, section):
    """Render a nested hyperlink element as a single ``<a>`` tag.

    Each entry of ``el._elements`` is a ``(child, paragraph, linkId)``
    triple. The link body is the concatenation of ``parse_paragraph`` for
    every entry. The ``href`` is the ``section._links`` lookup for the
    last entry's link id, or ``''`` when there are no entries.
    """
    url = ''
    fragments = []
    for child, paragraph, link_id in el._elements:
        url = section._links.get(link_id, None)
        fragments.append(parse_paragraph(Run(child, paragraph), paragraph))
    return '<a href="%s">%s</a>' % (url, ''.join(fragments))
def parse_list(el, section):
    """Render a nested list element as HTML.

    Each entry of ``el._elements`` is ``(child, paragraph, (type_, lvl))``.
    Runs that share a paragraph are concatenated into one ``<li>``. Items
    at level ``'1'`` are wrapped in their own ``<type_>`` tag before being
    appended to the output. The whole result is wrapped in the tag taken
    from the first entry's attributes.

    NOTE(review): entries skipped by ``continue`` (breaks, unknown tags)
    also skip the flush logic below, so if the last entry is skipped the
    pending ``item`` text is never emitted.
    """
    # outer wrapper tag comes from the first entry's (type_, lvl) pair
    listtype = el._elements[0][2][0]
    list = ''   # finished markup (shadows the builtin within this function)
    items = ''  # <li> elements not yet appended to ``list``
    item = ''   # content of the <li> currently being built
    for i, e in enumerate(el._elements):
        child, paragraph, attrs = e
        type_, lvl = attrs
        tag = TAGS.get(child.tag, None)
        linkId = None
        if tag == 'break' or tag == None:
            continue
        elif tag == 'hyperlink':
            linkId = childishyperlink(child)
            # render the run nested inside the hyperlink
            child = child.find('.//w:r', NAMESPACES)
        run = Run(child, paragraph)
        p = parse_paragraph(run, paragraph)
        if tag == 'hyperlink':
            p = '<a href="%s">%s</a>' % (section._links.get(linkId, None), p)
        item += p
        footnote = runisfootnote(run)
        if footnote:
            item += '{{%s}}' % footnote
        # add:  the current <li> is complete (last entry, or the next entry
        #       belongs to a different paragraph)
        # nest: the pending level-'1' items must be wrapped in a sub-list
        #       (last entry, or the next entry is at level '0')
        add = False
        nest = False
        if i == len(el._elements)-1:
            add = True
            if lvl == '1':
                nest = True
        elif el._elements[i+1][1] != paragraph:
            add = True
            if lvl == '1' and el._elements[i+1][2][1] == '0':
                nest = True
        if add:
            item = '<li>%s</li>' % item
            items += item
            item = ''
        if nest:
            items = u"<{0}>{1}</{0}>".format(type_, items)
            list += items
            items = ''
        elif add and lvl == '0':
            list += items
            items = ''
    return u"<{0}>{1}</{0}>".format(listtype, list)
def build_c_run(c, para, run):
    """Insert a new run holding *c* right after *run*, copying its formatting.

    The new ``<w:r>`` is created from ``para``'s element and placed as the
    next sibling of ``run``'s element. Style, italic, bold, underline, font
    name and font size are copied from *run*. Returns the new ``Run``.
    """
    new_r = para._element._new_r()
    run._element.addnext(new_r)

    clone = Run(new_r, run._parent)
    clone.text = c
    for attr in ('style', 'italic', 'bold', 'underline'):
        setattr(clone, attr, getattr(run, attr))
    for attr in ('name', 'size'):
        setattr(clone.font, attr, getattr(run.font, attr))
    return clone
def replace_simple_field(self, field, replacement):
    """Swap a ``fldSimple`` node for a fresh run and let *replacement* fill it.

    A new run is created in the paragraph that owns *field*, put in the
    field's position, and handed to ``replacement.fill``.
    """
    container = field.getparent()
    # the standard says that this is the case most of the time so we only
    # deal with this case for now:
    assert container.tag == namespaced('p')

    paragraph = Paragraph(container, self._body)
    new_run = Run(paragraph._p._add_r(), paragraph)
    container.replace(field, new_run._element)
    replacement.fill(new_run)
def given_a_run_having_mixed_text_content(context):
    """
    Store on ``context.run`` a run with mixed content, i.e. ``<w:t>``
    elements interleaved with ``<w:tab/>``, ``<w:cr/>``, ``<w:drawing/>``
    and ``<w:br/>`` elements.
    """
    r_xml = """\
        <w:r %s>
          <w:t>abc</w:t>
          <w:tab/>
          <w:t>def</w:t>
          <w:cr/>
          <w:t>ghi</w:t>
          <w:drawing/>
          <w:br/>
          <w:t>jkl</w:t>
        </w:r>""" % nsdecls('w')
    context.run = Run(parse_xml(r_xml), None)
def replace_complex_field(self, field, replacement):
    """Replace a ``fldChar``-style field with one run filled by *replacement*.

    A complex field is not a single tag but a series of runs:

    1. ``<w:fldChar w:fldCharType="begin"/>`` marks the beginning;
    2. ``<w:instrText xml:space="preserve"> MERGEFIELD test \\* MERGEFORMAT
       </w:instrText>`` contains the field's name (this is *field*);
    3. ``<w:fldChar w:fldCharType="end"/>`` marks the end.

    Raises ``ValueError`` when either marker cannot be found among the
    siblings of the run holding *field*.
    """
    def find_marker_run(start, step, marker_xpath, error_template):
        # walk sibling runs from ``start`` until one contains the marker
        node = start
        while not node.xpath(marker_xpath):
            node = step(node)
            if node is None:
                raise ValueError(error_template.format(field))
        return node

    # get the run this field is in
    instr_run_node = field.getparent()
    assert instr_run_node.tag == namespaced('r')

    # the opening fldChar is the first one met walking backwards ...
    opening_run_node = find_marker_run(
        instr_run_node,
        lambda node: node.getprevious(),
        'w:fldChar[@w:fldCharType="begin"]',
        "Could not find beginning of field with instr node '{}'?! Is the document malformed?",
    )
    # ... and the closing one is the first one met walking forwards
    closing_run_node = find_marker_run(
        instr_run_node,
        lambda node: node.getnext(),
        'w:fldChar[@w:fldCharType="end"]',
        "Could not find end of field with instr node '{}'?! Is the document malformed?",
    )

    # now replace the runs from the opening run up to the closing run
    paragraph = Paragraph(instr_run_node.getparent(), self._body)
    p_elm = paragraph._element
    first = p_elm.index(opening_run_node)
    last = p_elm.index(closing_run_node)
    new_run = Run(paragraph._p._add_r(), paragraph)
    # NOTE(review): the slice stops before ``last``, so the run holding the
    # closing fldChar stays in the paragraph -- confirm this is intended.
    p_elm[first:last] = [new_run._element]
    replacement.fill(new_run)
def _convert_paragraph(self, p: Paragraph) -> str:
    """Convert the children of paragraph *p* and join the results.

    Runs are converted with ``_convert_run``, unless their XML contains a
    ``ProgID="Equation.DSMT`` marker, in which case they go through
    ``_convert_equation``. Math children go through ``_convert_omath`` and
    table children through ``_convert_table``. Other children are ignored.
    """
    # snapshot the children before iterating
    children = list(p._element)
    run_lst = []
    for child in children:
        if isinstance(child, CT_R2):
            run = Run(child, p)
            if 'ProgID="Equation.DSMT' in run.element.xml:
                v = self._convert_equation(run)
                # a one-character result is emitted as-is; anything longer
                # is wrapped in a math-tex span with the value HTML-escaped
                if len(v) == 1:
                    run_lst.append(v)
                else:
                    run_lst.append(r'<span class="math-tex" data-latex="{0}">\({0}\)</span>'.format(html.escape(v)))
            else:
                run_lst.append(self._convert_run(run))
        elif isinstance(child, (CT_OMath, CT_OMathPara)):
            run_lst.append(self._convert_omath(child))
        elif isinstance(child, CT_Tbl):
            run_lst.append(self._convert_table(child))
    # NOTE(review): the whole paragraph element is passed to _convert_omath
    # once more after the loop; verify this does not duplicate math that
    # was already converted per child above.
    run_lst.append(self._convert_omath(p._element))
    return ''.join(run_lst)
def get_block_type(child, paragraph):
    """Classify *child* of *paragraph* and return a 4-tuple describing it.

    The tuple is ``(block class, block name, nested kind, nested value)``.
    Any position may be ``None``, and an all-``None`` tuple is returned for
    children that are skipped. Tables are recognised by type; everything
    else is wrapped in a ``Run`` and tested in a fixed order: unknown tag,
    figure, break, list, empty text, hyperlink, "Heading 2" (box start /
    box end / heading), footnote, and finally a plain paragraph.
    """
    if type(child) == Table:
        return (TableBlock, 'table', None, None)

    tag = TAGS.get(child.tag, None)
    run = Run(child, paragraph)
    list_type = paragraphlisttype(paragraph)
    is_break = childisbreak(child)
    link = childishyperlink(child)
    note = runisfootnote(run)

    nothing = (None, None, None, None)

    if tag is None:
        return nothing
    if runisfigure(run):
        return (InlineImageBlock, 'inline_image', None, None)
    if is_break:
        # a break inside a list is dropped; elsewhere it is kept
        if list_type:
            return nothing
        return (ParagraphBlock, 'paragraph', 'break', None)
    if list_type:
        return (ParagraphBlock, 'paragraph', 'list', list_type)
    if run.text == '' and not runisfootnote(run):
        return nothing
    if link:
        return (ParagraphBlock, 'paragraph', 'hyperlink', link)
    if paragraph.style.name == 'Heading 2':
        if runisboxstart(run):
            return (BoxBlock, 'boxstart', None, None)
        if runisboxend(run):
            return (None, 'boxend', None, None)
        return (HeadingBlock, 'heading', None, None)
    if note:
        return (ParagraphBlock, 'paragraph', 'footnote', note)
    return (ParagraphBlock, 'paragraph', None, None)
def style_get_fixture(self, part_prop_):
    """Return a run styled 'Barfoo', that style id, and the mocked style.

    The mocked style is what ``part_prop_``'s ``get_style`` returns.
    """
    style_id = 'Barfoo'
    run = Run(element('w:r/w:rPr/w:rStyle{w:val=%s}' % style_id), None)
    return run, style_id, part_prop_.return_value.get_style.return_value
def add_text_fixture(self, request, Text_):
    """Return ``(run, text, expected_xml, Text_)`` for the current case.

    ``request.param`` supplies the initial run cxml, the text to add and
    the cxml expected afterwards.
    """
    start_cxml, text, want_cxml = request.param
    return Run(element(start_cxml), None), text, xml(want_cxml), Text_
def convert(source_path, out_path, short_name, cite, year):
    """Restyle the document at *source_path* from the template and save it.

    Steps, in order: copy styles/settings/section formatting from the
    template, replace the headers, normalise typography in body and
    footnotes, tidy footnotes, style the caption paragraphs, drop blank
    paragraphs, turn highlighted headnote markers into bookmarks with
    page-range fields, and write the result to *out_path*.

    Args:
        source_path: document to convert, passed to ``load_doc``.
        out_path: destination passed to ``source_doc.save``.
        short_name: case name written into the header; `` v. `` is
            italicised when present.
        cite: citation written into the header; its last space-separated
            token is used as the starting page number.
        year: year written into the header.

    Raises:
        Exception: whatever author-name detection raised, re-raised after
            a warning is printed.

    NOTE(review): ``template_path`` is not a parameter; presumably a
    module-level name -- confirm.
    """
    ### TODO:
    # whitelist allowed tags
    # replace paragraph with .5 inch indented first line with a tab

    ### known changes:
    # tighter character spacing?
    # footnote numbers bold?
    # no space after footnote number?

    ### LOAD DATA ###

    # load docs
    source_doc, source_pq = load_doc(source_path)
    template_doc, template_pq = load_doc(template_path)

    # load footnotes
    footnotes_part, footnotes_el, footnotes_pq = load_part(
        source_doc.part.part_related_by(RT.FOOTNOTES))
    template_footnotes_part, template_footnotes_el, template_footnotes_pq = load_part(
        template_doc.part.part_related_by(RT.FOOTNOTES))

    ### COPY STYLES FROM TEMPLATE ###

    # copy styles, settings, and section formatting from template doc
    replace_element_contents(template_doc.styles._element,
                             source_doc.styles._element)
    replace_element_contents(template_doc.settings._element,
                             source_doc.settings._element)
    replace_element_contents(
        template_pq('w|sectPr')[0], source_pq('w|sectPr')[0])
    # first footnote is the footnote separator
    replace_element_contents(
        template_footnotes_pq('w|footnote').children()[0],
        footnotes_pq('w|footnote').children()[0])

    ### HEADERS ###

    # delete existing header parts and copy in new header parts
    for rId, rel in list(source_doc.part.rels.items()):
        if rel.reltype == RT.HEADER:
            del source_doc.part.rels[rId]
    update_refs = {}
    header_parts = []
    for rId, rel in template_doc.part.rels.items():
        if rel.reltype == RT.HEADER:
            new_id = source_doc.part.rels._next_rId
            update_refs[rId] = new_id
            header_parts.append(load_part(rel.target_part))
            source_doc.part.rels.add_relationship(RT.HEADER,
                                                  rel.target_part, new_id)
            source_doc.part.package.parts.append(rel.target_part)

    # update header references
    for header_ref in source_pq('w|headerReference'):
        header_ref.attrib[qn('r:id')] = update_refs[header_ref.attrib[qn(
            'r:id')]]

    # fill in header values
    for header_part, header_el, header_pq in header_parts:
        header_pq("w|rStyle[w|val='HeaderYear']").closest('w|r')('w|t').text(
            year)
        header_pq("w|rStyle[w|val='HeaderCitation']").closest('w|r')(
            'w|t').text(cite)
        short_name_par = Paragraph(
            header_pq("w|pStyle[w|val='HeaderCaseName']").closest('w|p')[0],
            None)
        short_name_par.clear()
        # italicize v. in party name
        if ' v. ' in short_name:
            # maxsplit=1 yields exactly two parts even when the name holds
            # more than one " v. "; the remainder stays in party_b
            party_a, party_b = short_name.split(' v. ', 1)
            short_name_par.add_run(party_a)
            vs_run = short_name_par.add_run(' v. ')
            vs_run.italic = True
            short_name_par.add_run(party_b)
        else:
            short_name_par.add_run(short_name)

    # set starting page number
    starting_page_number = cite.rsplit(' ', 1)[-1]
    source_pq('w|sectPr').append(
        make_el(
            source_pq('w|sectPr')[0], 'w:pgNumType',
            {'w:start': starting_page_number}))

    ### TYPOGRAPHY ###

    # apply typography changes to body text and footnotes, adjusting
    # variables that are different
    for query, allowed_styles, section_name, blockquote_style_name in (
        (source_pq, ('FootnoteReference', ), 'body', 'Blockquote'),
        (footnotes_pq,
         ('FootnoteText', 'FootnoteSeparator', 'FootnoteReference'),
         'footnote', 'FootnoteBlockquote')):

        # clear existing styles
        ignore_removed_styles = ('NormalWeb', )
        for style_tag in query('w|pStyle,w|rStyle'):
            style_name = style_tag.attrib.get(qn('w:val'))
            if style_name not in allowed_styles:
                if style_name not in ignore_removed_styles:
                    print("Warning: removing unrecognized %s style %s."
                          % (section_name, style_name))
                remove_el(style_tag)

        # mark block quotes
        for par in query('w|ind[w|left="720"]'):
            if qn('w:hanging') not in par.attrib:
                par = pq(par).closest('w|p')[0]
                par.style = blockquote_style_name

        # remove fonts and sizes
        remove_tags = ('sz', 'szCs', 'rFonts', 'ind', 'spacing', 'proofErr',
                       'bookmarkStart', 'bookmarkEnd',
                       'color[w|val="000000"]', 'lastRenderedPageBreak')
        for tag in remove_tags:
            query('w|' + tag).remove()

        # underline to italic
        for el in query('w|u'):
            if el.attrib.get(qn('w:val')) == 'double':
                el.tag = qn('w:smallCaps')
            else:
                el.tag = qn('w:i')
            el.attrib.clear()

        # combine consecutive runs with identical formatting
        # add preserve to all t blocks for uniformity
        query('w|t').attr(qn('xml:space'), 'preserve')
        skip = 0
        for run in query('w|r'):

            # skip runs that have already been appended to previous run
            # and detached
            if skip:
                skip -= 1
                continue

            blank_r1 = blank_run(run)
            while True:
                r2 = pq(run).next()
                if not r2:
                    break
                r2 = r2[0]
                if r2.tag != run.tag or etree.tostring(
                        blank_r1) != etree.tostring(blank_run(r2)):
                    break
                run.text += r2.text
                remove_el(r2)
                skip += 1

        # text replacements
        for t in query('w|t'):
            text = t.text
            # fix dashes
            text = text.replace(" -- ", " — ")
            # remove double spaces
            text = re.sub(' +', ' ', text)
            # fix quotes
            for straight_quote, left_quote, right_quote in (('"', '“', '”'),
                                                            ("'", '‘', '’')):
                if straight_quote not in text:
                    continue
                # right smart quotes
                text = re.sub(r'([a-zA-Z0-9.,?!;:\'\"])%s' % straight_quote,
                              r'\1%s' % right_quote, text)
                text = re.sub(r'%s ' % straight_quote, r'%s ' % right_quote,
                              text)
                # remaining are left smart quotes
                text = text.replace(straight_quote, left_quote)
            t.text = text

    ### FOOTNOTES ###

    # first run in template footnotes is a tab
    footnote_tab = deepcopy(
        template_footnotes_pq('w|footnote:not([w|type]) w|r')[0])
    for footnote in footnotes_pq('w|footnote:not([w|type])'):

        # remove extra tabs from footnotes, add single tab
        for run in pq(footnote, namespaces=nsmap)('w|r'):
            if pq(run, namespaces=nsmap)('w|tab'):
                remove_el(run)
            else:
                pq(run).before(deepcopy(footnote_tab))
                break

        # make sure footnotes have FootnoteText style
        for par in pq(footnote, namespaces=nsmap)('w|p'):
            if not par.style:
                par.style = 'FootnoteText'

    ### CAPTION ###

    def skip_blanks(paragraphs, par_num):
        # index of the next paragraph after par_num that has text
        par_num += 1
        while not has_text(paragraphs[par_num]):
            par_num += 1
        return par_num

    # delete first four paragraphs
    pq(source_pq('w|p')[:4]).remove()

    paragraphs = source_pq('w|p')

    # format first paragraph
    par_num = 0
    paragraphs[par_num].style = 'CaseName'
    # process the case name so all-caps becomes small-caps:
    for run in pq(paragraphs[par_num])('w|r'):
        parts = re.split(r'([A-Z][A-Z]+)', run.text)
        if len(parts) > 1:
            new_runs = split_run(run, parts)
            # every other part will be all-caps, so should become small-caps
            for new_run in new_runs[1::2]:
                Run(new_run, None).font.small_caps = True
                new_run.text = new_run.text.title()

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Dates'

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Judges'

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Categories'

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Headnote'
        par_num += 2
    # extra space for last headnote
    Paragraph(paragraphs[par_num - 2],
              None).paragraph_format.space_after = Pt(12)

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'History'
        par_num += 2

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Appearance'
        par_num += 1

    # mark author name -- first sentence of first paragraph of case text
    par_num = skip_blanks(paragraphs, par_num)
    first_paragraph = Paragraph(paragraphs[par_num], source_doc._body)
    try:
        first_run = next(r for r in first_paragraph.runs if r.text.strip())
        first_run, second_run = split_run(first_run._element,
                                          first_run.text.split('.', 1))
        first_run.text = first_run.text.title() + "."
        Run(first_run, first_paragraph).style = "Author"
    except Exception:
        print("Warning: failed to detect author name. Searched this text: %s"
              % first_paragraph.text)
        raise

    # remove blank paragraphs
    # this has to come AFTER caption processing so we can tell sections apart
    for query in (source_pq, footnotes_pq('w|footnote:not([w|type])')):
        for p in query('w|p'):
            if not has_text(p):
                remove_el(p)

    ### HEADNOTE PAGE RANGES ###

    # replace highlighted headnote markers with bookmarks
    bookmarks = []
    for i, highlight_run in enumerate(
            source_pq("w|highlight[w|val='yellow']")):
        highlight_run = pq(highlight_run).closest('w|r')
        # markers alternate Start/End; each pair shares one number
        bookmark_name = "Headnote%s%s" % ("End" if i % 2 else "Start",
                                          i // 2)
        highlight_run.after(
            pq([
                make_el(highlight_run[0], "w:bookmarkStart", {
                    "w:id": str(i),
                    "w:name": bookmark_name
                }),
                make_el(highlight_run[0], "w:bookmarkEnd", {"w:id": str(i)})
            ]))
        remove_el(highlight_run[0])
        bookmarks.append(bookmark_name)

    # replace headnote page number references with bookmark shortcodes
    reference_template = """
        <w:fldSimple w:instr=" PAGEREF {bookmark_start} ">
            <w:r><w:rPr><w:noProof/></w:rPr><w:t>PRINT</w:t></w:r>
        </w:fldSimple>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> IF </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_start} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> = </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> "" "-</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText>"</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
    """
    for headnote in source_pq('w|pStyle[w|val="Headnote"]'):
        for run in pq(headnote).closest('w|p')('w|r'):
            run = pq(run)
            parts = re.split(r'\[.*?\]', run('w|t').text())
            if len(parts) > 1:
                new_els = []
                for i, part in enumerate(parts):
                    if i != 0:
                        # each bracketed reference consumes one Start/End
                        # bookmark pair, in document order
                        new_els.extend(
                            parse_xml_fragment(
                                run[0],
                                reference_template.format(
                                    bookmark_start=bookmarks.pop(0),
                                    bookmark_end=bookmarks.pop(0))))
                    new_run = deepcopy(run[0])
                    pq(new_run)('w|t').text(("]" if i != 0 else "") + part + (
                        "[" if i != len(parts) - 1 else ""))
                    new_els.append(new_run)
                run.after(pq(new_els))
                remove_el(run[0])

    ### OUTPUT ###

    # write footnotes and headers
    save_part(footnotes_el, footnotes_part)
    for header_part, header_el, header_pq in header_parts:
        save_part(header_el, header_part)

    # save output
    #save_xml(out_path, source_doc)
    source_doc.save(out_path)
def add_break_fixture(self, request):
    """Return ``(run, break_type, expected_xml)`` for the current case.

    The run always starts as an empty ``w:r``; ``request.param`` supplies
    the break type and the expected cxml.
    """
    kind, want_cxml = request.param
    return Run(element('w:r'), None), kind, xml(want_cxml)
def bool_prop_get_fixture(self, request):
    """Return ``(run, bool_prop_name, expected_value)`` for the current case.

    The run is built from the cxml in ``request.param``.
    """
    start_cxml, prop_name, want_value = request.param
    return Run(element(start_cxml), None), prop_name, want_value
def add_tab_fixture(self, request):
    """Return ``(run, expected_xml)`` for the current case.

    ``request.param`` supplies the initial and the expected cxml.
    """
    start_cxml, want_cxml = request.param
    return Run(element(start_cxml), None), xml(want_cxml)
def underline_raise_fixture(self, request):
    """Return a run with an empty ``w:rPr`` and the invalid underline value.

    The invalid value is ``request.param`` itself.
    """
    return Run(element('w:r/w:rPr'), None), request.param
def bool_prop_set_fixture(self, request):
    """Return ``(run, bool_prop_name, value, expected_xml)`` for the case.

    ``request.param`` supplies the initial cxml, the property name, the
    value to assign and the expected cxml.
    """
    start_cxml, prop_name, new_value, want_cxml = request.param
    return Run(element(start_cxml), None), prop_name, new_value, xml(want_cxml)
def underline_get_fixture(self, request):
    """Return ``(run, expected_underline)`` for the current case.

    The run is built from the cxml in ``request.param``.
    """
    start_cxml, want_underline = request.param
    return Run(element(start_cxml), None), want_underline
def underline_set_fixture(self, request):
    """Return ``(run, new_underline, expected_xml)`` for the current case.

    ``request.param`` supplies the initial cxml, the underline value to
    assign and the expected cxml.
    """
    start_cxml, underline_value, want_cxml = request.param
    return Run(element(start_cxml), None), underline_value, xml(want_cxml)
def text_set_fixture(self, request):
    """Return ``(run, new_text, expected_xml)`` for the current case.

    The run starts with a ``w:t`` holding "should get deleted";
    ``request.param`` supplies the new text and the expected cxml.
    """
    text_to_set, want_cxml = request.param
    run = Run(element('w:r/w:t"should get deleted"'), None)
    return run, text_to_set, xml(want_cxml)
def text_get_fixture(self, request):
    """Return ``(run, expected_text)`` for the current case.

    The run is built from the cxml in ``request.param``.
    """
    start_cxml, want_text = request.param
    return Run(element(start_cxml), None), want_text
def style_set_fixture(self, request, part_prop_):
    """Return ``(run, value, expected_xml)`` for the current case.

    ``part_prop_``'s ``get_style_id`` is wired to return the case's style
    id. ``request.param`` supplies the initial cxml, the value to assign,
    that style id and the expected cxml.
    """
    start_cxml, new_value, style_id, want_cxml = request.param
    run = Run(element(start_cxml), None)
    part_prop_.return_value.get_style_id.return_value = style_id
    return run, new_value, xml(want_cxml)
def clear_fixture(self, request):
    """Return ``(run, expected_xml)`` for the current case.

    ``request.param`` supplies the initial and the expected cxml.
    """
    start_cxml, want_cxml = request.param
    return Run(element(start_cxml), None), xml(want_cxml)
def font_fixture(self, Font_, font_):
    """Return an empty-``w:r`` run together with the two font arguments."""
    empty_run = Run(element('w:r'), None)
    return empty_run, Font_, font_