def draw_like(paragraph: Paragraph, words: List[str], bold=False):
    """Append two randomly picked entries of ``words`` to ``paragraph``.

    Every picked word gets its own run, followed by a run that holds the
    module-level ``bracket`` string three times and an unformatted run that
    holds the module-level ``space`` string.

    :param paragraph: paragraph that receives the runs.
    :param words: candidate words; ``random.sample`` raises ``ValueError``
        when fewer than two entries are available.
    :param bold: bold flag applied to the word run and to the bracket run.
    """
    for picked in random.sample(words, 2):
        paragraph.add_run(picked).bold = bold
        paragraph.add_run(bracket * 3).bold = bold
        # The separator run is intentionally left without formatting.
        paragraph.add_run(space)
def insert_paragraph_after(self, paragraph, text=None, style=None):
    """Create a paragraph directly behind ``paragraph`` and return it.

    :param paragraph: existing paragraph; the new ``w:p`` element becomes
        the next sibling of its ``_p`` element.
    :param text: optional text, added as a single run when truthy.
    :param style: optional style assigned to the new paragraph when it is
        not None.
    :return: the newly created ``Paragraph``.
    """
    sibling = OxmlElement("w:p")
    paragraph._p.addnext(sibling)
    inserted = Paragraph(sibling, paragraph._parent)
    if text:
        inserted.add_run(text)
    if style is None:
        return inserted
    inserted.style = style
    return inserted
def _insert_paragraph_after(paragraph, text=None, style=None):
    """Add a fresh paragraph as the next sibling of ``paragraph``.

    Args:
        paragraph: Paragraph after which the new one is placed.
        text: Optional text; a run is only created for a truthy value.
        style: Optional style; only assigned when it is not None.

    Returns:
        The new ``Paragraph`` wrapping the inserted ``w:p`` element.
    """
    element = OxmlElement("w:p")
    paragraph._p.addnext(element)
    result = Paragraph(element, paragraph._parent)

    wants_text = bool(text)
    if wants_text:
        result.add_run(text)

    has_style = style is not None
    if has_style:
        result.style = style

    return result
def insert_paragraph_after(inparagraph, text=None, style=None):
    """Insert a new, optionally filled paragraph right after ``inparagraph``.

    :param inparagraph: reference paragraph; the new element is attached
        after its ``_p`` element and shares its parent.
    :param text: text for a single run; any value other than None
        (including the empty string) creates the run.
    :param style: style for the new paragraph; skipped when None.
    :return: the inserted ``Paragraph``.
    """
    p_element = OxmlElement("w:p")
    inparagraph._p.addnext(p_element)
    following = Paragraph(p_element, inparagraph._parent)
    if text is None:
        pass
    else:
        following.add_run(text)
    if style is None:
        return following
    following.style = style
    return following
def __init__(self, cif: CifContainer, paragraph: Paragraph):
    """Write the crystallization method as one sentence into ``paragraph``.

    The method is read from ``_exptl_crystal_recrystallization_method``; a
    bracketed placeholder is used when the value is empty. The finished
    sentence is kept in ``self.text``.
    """
    self.cif = cif
    # NOTE: the attribute name keeps its historic spelling because it is
    # part of the object's public surface.
    self.crytsalization_method = (
        gstr(self.cif['_exptl_crystal_recrystallization_method'])
        or '[No crystallization method was given]'
    )
    single_line = remove_line_endings(
        retranslate_delimiter(self.crytsalization_method))
    self.text = f"{single_line}. "
    paragraph.add_run(retranslate_delimiter(self.text))
def add_r_int_value(cif: CifContainer, rint_p: Paragraph):
    """Write the reflection count and the R(int)/R(sigma) values.

    Three pieces are appended to ``rint_p``: the total number of
    reflections, then ``R`` (italic) + ``int`` (subscript) with its value,
    then ``R`` (italic) + ``sigma`` (subscript) with its value. Every value
    is passed through ``this_or_quest`` before it is written.
    """
    # All CIF look-ups happen before the first run is written.
    total = cif['_reflns_number_total']
    r_equivalents = cif['_diffrn_reflns_av_R_equivalents']
    unet_i = cif['_diffrn_reflns_av_unetI/netI']

    def add_r_value(subscript, value, suffix=''):
        """Append 'R<subscript> = value' followed by ``suffix``."""
        rint_p.add_run('R').font.italic = True
        rint_p.add_run(subscript).font.subscript = True
        rint_p.add_run(' = ' + this_or_quest(value) + suffix)

    rint_p.add_run(this_or_quest(total) + '\n')
    add_r_value('int', r_equivalents, '\n')
    add_r_value('sigma', unet_i)
def __init__(self, cif: CifContainer, paragraph: Paragraph):
    """Describe the disorder refinement in ``paragraph``.

    The general restraint sentence is always written. A second sentence
    about the program DSR follows only when ``cif.dsr_used`` is truthy; it
    is also stored in ``self.dsr_sentence`` (empty string otherwise).
    """
    self.cif = cif
    self.dsr_sentence = (
        "Some parts of the disorder model were introduced by the "
        "program DSR."
    ) if self.cif.dsr_used else ''
    paragraph.add_run(
        "Disordered moieties were refined using bond lengths "
        "restraints and displacement parameter restraints. ")
    if not self.dsr_sentence:
        return
    paragraph.add_run(self.dsr_sentence)
def format_space_group(paragraph: Paragraph, space_group: str, it_number: str) -> None:
    """
    Write the formatted space group symbol (row 6 of the report table).

    The symbol is converted via ``SpaceGroups().to_mathml`` and
    ``math_to_word``, appended to the paragraph element, and followed by a
    run with ``it_number`` in parentheses. When any of these steps raises,
    the plain ``space_group`` text is added as a run instead.
    """
    try:
        # The HM space group type, rendered as a Word math object.
        mathml = SpaceGroups().to_mathml(space_group)
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
        word_math = math_to_word(mathml)
        paragraph._element.append(word_math)
        number_text = ' (' + it_number + ')'
        paragraph.add_run(number_text)
    except Exception:
        # Deliberate best effort: fall back to the unformatted symbol.
        paragraph.add_run(space_group)
def __init__(self, cif: CifContainer, paragraph: Paragraph, ref: ReferenceList):
    """Write the CCDC deposition statement and register its reference.

    The deposition number comes from ``_database_code_depnum_ccdc_archive``
    ('??????' when empty). A ``CCDCReference`` is appended to ``ref``
    between the two sentences, followed by a regular ``SpaceChar``.
    """
    self.cif = cif
    deposition_number = gstr(
        self.cif['_database_code_depnum_ccdc_archive']) or '??????'
    deposited = (
        "Crystallographic data for the structures reported in this "
        "paper have been deposited with the Cambridge Crystallographic Data Centre."
    )
    availability_template = (
        "CCDC {} contain the supplementary crystallographic data for this paper. "
        "These data can be obtained free of charge from The Cambridge Crystallographic Data Centre "
        "via www.ccdc.cam.ac.uk/{}structures."
    )
    availability = availability_template.format(deposition_number, zero_width_space)
    paragraph.add_run(deposited)
    ref.append(CCDCReference())
    SpaceChar(paragraph).regular()
    paragraph.add_run(availability)
def __init__(self, cif: CifContainer, paragraph: Paragraph):
    """Write the data-collection sentence with the measurement temperature.

    The crystal is called 'shock-cooled' unless the numeric part of
    ``_diffrn_ambient_temperature`` (text before an opening parenthesis) is
    greater than 200 or cannot be parsed as a float. The sentence is stored
    in ``self.txt``.
    """
    self.cif = cif
    self.temperature = gstr(self.cif['_diffrn_ambient_temperature'])
    self._name = cif.fileobj.name
    try:
        numeric_part = self.temperature.split('(')[0]
        shock_cooled = not (float(numeric_part) > 200)
    except ValueError:
        # No usable number: do not claim a cooling method.
        shock_cooled = False
    method = 'shock-cooled ' if shock_cooled else ''
    self.txt = "The data for {} were collected from a {}single crystal at {}{}K ".format(
        self.cif.block.name, method, self.temperature, protected_space)
    paragraph.add_run(retranslate_delimiter(self.txt))
def draw_one_multi_pron(paragraph: Paragraph, letter: str, bold=False):
    """Append three stacked lines: ``space``, ``letter`` and ``bracket``.

    Each line is one run carrying the ``bold`` flag, followed by a separate
    unformatted newline run. Only the first run (the module-level ``space``
    string) is additionally underlined.
    """
    lines = (
        (space, True),
        (letter, False),
        (bracket, False),
    )
    for content, underlined in lines:
        line_run = paragraph.add_run(content)
        line_run.bold = bold
        if underlined:
            line_run.font.underline = True
        paragraph.add_run('\n')
def insert_paragraph_after(inparagraph, text=None, style=None):
    """Insert a new paragraph after the given paragraph.

    Args:
        inparagraph: Paragraph after which the new one is attached.
        text: Optional text; a run is added for any value other than None.
        style: Optional style name. It is resolved through the module-level
            ``template.styles`` and applied whenever it is given.

    Returns:
        The newly inserted ``Paragraph``.

    Raises:
        KeyError: If ``style`` is given but ``template.styles`` has no such
            style (raised by the styles lookup).
    """
    new_p = OxmlElement("w:p")
    inparagraph._p.addnext(new_p)
    new_para = Paragraph(new_p, inparagraph._parent)
    # The style is handled independently of the text: previously a call
    # with text but without style looked up ``template.styles[None]`` and
    # failed, and a style passed without text was silently dropped.
    if style is not None:
        new_para.style = template.styles[style]
    if text is not None:
        new_para.add_run(text)
    return new_para
def __init__(self, cif: CifContainer, paragraph: Paragraph, ref: ReferenceList):
    """Write the integration/absorption-correction sentence.

    The integration program is derived from ``_computing_data_reduction``
    and the scaling program from ``_exptl_absorpt_process_details``. The
    data-reduction and absorption references are appended to ``ref`` as one
    list; ``DummyReference`` objects stand in for unrecognised programs.

    :param cif: CIF container the values are read from.
    :param paragraph: paragraph that receives the sentence.
    :param ref: reference list that receives ``[data_reduct_ref, absorpt_ref]``.
    """
    self.cif = cif
    integration = gstr(self.cif['_computing_data_reduction']) or '??'
    abstype = gstr(self.cif['_exptl_absorpt_correction_type']) or '??'
    abs_details = gstr(self.cif['_exptl_absorpt_process_details']) or '??'
    data_reduct_ref = DummyReference()
    absorpt_ref = DummyReference()
    integration_prog = '[unknown integration program]'
    scale_prog = '[unknown program]'
    if 'SAINT' in integration:
        data_reduct_ref, integration_prog = self.add_saint_reference(integration)
    if 'crysalispro' in integration.lower():
        data_reduct_ref, absorpt_ref, integration_prog = self.add_crysalispro_reference(integration)
    # Program detection is case-insensitive throughout. The inner SADABS
    # test used to be case-sensitive, so "sadabs" was reported as TWINABS.
    absdetails_upper = cif['_exptl_absorpt_process_details'].upper()
    if 'SADABS' in absdetails_upper or 'TWINABS' in absdetails_upper:
        scale_prog = 'SADABS' if 'SADABS' in absdetails_upper else 'TWINABS'
        absorpt_ref = SadabsTwinabsReference()
    if 'SORTAV' in absdetails_upper:
        scale_prog = 'SORTAV'
        absorpt_ref = SORTAVReference()
    if 'crysalis' in abs_details.lower():
        scale_prog = 'SCALE3 ABSPACK'
    sentence = 'All data were integrated with {} and {} {} absorption correction using {} was applied.'
    txt = sentence.format(integration_prog, get_inf_article(abstype), abstype, scale_prog)
    paragraph.add_run(retranslate_delimiter(txt))
    ref.append([data_reduct_ref, absorpt_ref])
def add_line_break(paragraph: Paragraph, num: int, font_size=None) -> None:
    """Append ``num`` line breaks to ``paragraph`` inside one new run.

    Args:
        paragraph: Paragraph that receives the run.
        num: How many breaks to add; a value below one adds none, but the
            (empty) run is still created.
        font_size: Size assigned to the run's font when truthy.
    """
    break_run = paragraph.add_run()
    if font_size:
        break_run.font.size = font_size
    remaining = num
    while remaining > 0:
        break_run.add_break()
        remaining -= 1
def __init__(self, cif: CifContainer, paragraph: Paragraph, ref: ReferenceList):
    """Write the structure solution/refinement sentence.

    The solution and refinement programs are read from the CIF and mapped
    to their references; ``DummyReference`` is used for unrecognised
    programs. The sentence ends with an italic ``F`` (plus a superscript
    ``2`` when ``_refine_ls_structure_factor_coef`` is 'Fsqd'), the
    refinement program and, when ShelXle is mentioned, ' using ShelXle'.

    :param cif: CIF container the values are read from.
    :param paragraph: paragraph that receives the runs.
    :param ref: reference list that receives
        ``[solveref, refineref, shelxle]``; ``shelxle`` is None when
        ShelXle was not used.
    """
    self.cif = cif
    refineref = DummyReference()
    solveref = DummyReference()
    solution_prog = gstr(self.cif['_computing_structure_solution']) or '??'
    solution_method = gstr(self.cif['_atom_sites_solution_primary']) or '??'
    solution_upper = solution_prog.upper()
    if solution_upper.startswith(('SHELXT', 'XT')):
        solveref = SHELXTReference()
    if 'SHELXS' in solution_upper:
        solveref = SHELXSReference()
    if 'SHELXD' in solution_upper:
        solveref = SHELXDReference()
    refined = gstr(self.cif['_computing_structure_refinement']) or '??'
    refined_upper = refined.upper()
    if refined_upper.startswith(('SHELXL', 'XL')):
        refineref = SHELXLReference()
    if 'OLEX' in refined_upper:
        refineref = Olex2Reference()
    refine_coef = gstr(self.cif['_refine_ls_structure_factor_coef'])
    # Grammar fix in the emitted report text: "The structure was solved".
    sentence = "The structure was solved by {} methods using {} and refined by full-matrix " \
               "least-squares methods against "
    txt = sentence.format(solution_method.strip('\n\r'), solution_prog.split()[0])
    paragraph.add_run(retranslate_delimiter(txt))
    paragraph.add_run('F').font.italic = True
    if refine_coef.lower() == 'fsqd':
        paragraph.add_run('2').font.superscript = True
    paragraph.add_run(' by {}'.format(refined.split()[0]))
    shelxle = None
    if 'shelxle' in refined.lower() \
            or 'shelxle' in self.cif['_computing_molecular_graphics'].lower():
        paragraph.add_run(' using ShelXle')
        shelxle = ShelXleReference()
    paragraph.add_run('.')
    ref.append([solveref, refineref, shelxle])
def __init__(self, cif: CifContainer, paragraph: Paragraph):
    """Write the diffractometer description into ``paragraph``.

    Device, source, monochromator, detector, cooling device, radiation
    type and wavelength are read from the CIF. Missing values are replaced
    by bracketed placeholders, except the cooling device, which falls back
    to an empty string. The radiation type is split by ``format_radiation``
    into three parts that are written as plain, italic and
    italic-subscript runs.

    :param cif: CIF container the values are read from.
    :param paragraph: paragraph that receives the runs.
    """
    self.cif = cif
    self.difftype = gstr(self.cif['_diffrn_measurement_device_type']) \
                    or '[No measurement device type given]'
    self.device = gstr(self.cif['_diffrn_measurement_device']) \
                  or '[No measurement device given]'
    self.source = gstr(self.cif['_diffrn_source']).strip('\n\r') \
                  or '[No radiation source given]'
    # The placeholder guarantees a truthy value, so the former
    # "if not self.monochrom" fallback to '?' could never run.
    self.monochrom = gstr(self.cif['_diffrn_radiation_monochromator']) \
                     or '[No monochromator type given]'
    self.cooling = gstr(self.cif['_olex2_diffrn_ambient_temperature_device']) \
                   or ''
    self.rad_type = gstr(self.cif['_diffrn_radiation_type']) \
                    or '[No radiation type given]'
    radtype = format_radiation(self.rad_type)
    self.wavelen = gstr(self.cif['_diffrn_radiation_wavelength']) \
                   or '[No wavelength given]'
    # Likewise always truthy, so the detector phrase is always present.
    detector_type = gstr(self.cif['_diffrn_detector_type']) \
                    or '[No detector type given]'
    self.detector_type = " and a {} detector".format(detector_type)
    sentence1 = "on {0} {1} {2} with {3} {4} using {5} as monochromator{6}. " \
                "The diffractometer was equipped with {7} {8} low temperature device and used "
    txt = sentence1.format(get_inf_article(self.difftype), self.difftype, self.device,
                           get_inf_article(self.source), self.source, self.monochrom,
                           self.detector_type, get_inf_article(self.cooling), self.cooling)
    paragraph.add_run(retranslate_delimiter(txt))
    # radiation type e.g. Mo:
    paragraph.add_run(retranslate_delimiter(radtype[0]))
    # K line:
    radrunita = paragraph.add_run(radtype[1])
    radrunita.font.italic = True
    alpha = paragraph.add_run(retranslate_delimiter(radtype[2]))
    alpha.font.italic = True
    alpha.font.subscript = True
    # One explicit format call; the old code concatenated a half-formatted
    # literal and formatted the result a second time to get the same text.
    txt2 = " radiation (λ = {}{}{}). ".format(self.wavelen, protected_space, angstrom)
    paragraph.add_run(txt2)
def __render_inline_element(self, p: Paragraph, pq: PyQuery, bold=False, italic=False, sub=False, sup=False,
                            underline=False, font_size=None, strike=False):
    """
    Render the contents of an inline element into a paragraph.

    Child elements are handed to ``_render_element`` with the same
    formatting flags; text nodes become runs carrying those flags.

    :param p: target paragraph
    :param pq: element whose contents are rendered
    :param bold: bold
    :param italic: italic
    :param sub: subscript
    :param sup: superscript
    :param underline: underline
    :param font_size: font size; only applied when truthy (the original
        note mentions 9pt, "small five", as the default size)
    :param strike: strike-through
    :return: None
    """
    for node in pq.contents():
        if isinstance(node, (HtmlElement, _Element)):
            # Nested markup: recurse with the inherited formatting.
            self._render_element(p, node, bold=bold, italic=italic, underline=underline, strike=strike,
                                 sup=sup, sub=sub, font_size=font_size)
            continue
        text_run = p.add_run(node)
        self.__force_simsun(text_run)
        text_run.underline = underline
        text_run.bold = bold
        text_run.italic = italic
        text_run.font.superscript = sup
        text_run.font.subscript = sub
        if font_size:
            text_run.font.size = font_size
        text_run.font.strike = strike
def draw_two_multi_pron(paragraph: Paragraph, letter1: str, letter2: str, bold=False):
    """Append a line with two letters, framed by ``draw_empty`` calls.

    ``letter1`` and ``letter2`` are written as separate runs carrying the
    ``bold`` flag, separated by four repetitions of the module-level
    ``space`` string and terminated by a newline run.
    """
    draw_empty(paragraph)
    first_letter = paragraph.add_run(letter1)
    first_letter.bold = bold
    gap = space * 4
    paragraph.add_run(gap)
    second_letter = paragraph.add_run(letter2)
    second_letter.bold = bold
    paragraph.add_run('\n')
    draw_empty(paragraph)
def add_table_of_contents(paragraph: Paragraph) -> None:
    """Append a TOC field (heading levels 1-3) to ``paragraph``.

    A new run receives, in order, a "begin" field character, the TOC
    instruction text, a "separate" field character that carries the
    placeholder text, and an "end" field character.
    """

    def field_char(char_type: str):
        """Return a ``w:fldChar`` element of the given ``w:fldCharType``."""
        node = OxmlElement("w:fldChar")
        node.set(qn("w:fldCharType"), char_type)
        return node

    toc_run = paragraph.add_run()

    begin = field_char("begin")

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    # Change 1-3 depending on the heading levels you need.
    instruction.text = 'TOC \\o "1-3" \\h \\z \\u'

    separate = field_char("separate")
    placeholder = OxmlElement("w:t")
    placeholder.text = "Right-click to update field."
    separate.append(placeholder)

    end = field_char("end")

    run_element = toc_run._r  # pylint: disable=protected-access
    for child in (begin, instruction, separate, end):
        run_element.append(child)
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add ``self.text`` as a run styled with ``self.style``.

    The run's font name is set to ``self.font_name`` and its size to
    ``Pt(self.font_size)``. The document argument is unused.
    """
    styled_run = p.add_run(self.text, self.style)
    run_font = styled_run.font
    run_font.name = self.font_name
    run_font.size = Pt(self.font_size)
def _render_img(self, p: Paragraph, pq: PyQuery):
    """
    Render an image into the paragraph.

    The ``src`` attribute is resolved in this order: a file already present
    in ``default_storage``, an inline ``data:image`` URI, and finally a
    download from ``settings.MEDIA_URL + src`` that is saved to storage
    first. Files pass through ``_convert_svg_to_jpg`` before insertion and
    every inserted picture is handed to ``adjust_pic_width``.

    :param p: paragraph that receives the picture run
    :param pq: the ``img`` element
    :return: None
    """
    from django.conf import settings
    src = pq.attr('src')
    if src is None:
        return
    width = self._get_pq_style(pq, 'width')
    col1_width = Cm(self.content_side_width)
    if width:
        digit_array = re.findall(r'\d+(?:\.\d+)*', width)
        # NOTE(review): when the style has no numeric part, ``width`` stays
        # the raw style string and is passed to add_picture below — confirm.
        if len(digit_array):
            width = float(digit_array[0])
            # Scale the declared width and cap it at the column width.
            width = min(self.get_cm(int(width * IMG_SIZE_ZOOM_FACTOR)), col1_width)
    # Reduce the URL to a path relative to the storage root.
    # NOTE(review): any "http" URL is cut by len(MEDIA_URL), whether or not
    # it actually starts with MEDIA_URL — confirm this is intended.
    if src.startswith("http"):
        src = src[len(settings.MEDIA_URL):]
    elif src.startswith('/media/'):
        src = src[len('/media/'):]
    if src.startswith('/'):
        src = src[1:]
    target_file_name = default_storage.path(src)
    if not default_storage.exists(target_file_name):
        # target_file_name = default_storage.path('tmp/export/word/' + src[src.rindex('/') + 1:])
        if src.startswith('data:image'):
            # Inline image: decode the base64 payload after the first comma.
            idx = src.index(',')
            stream = BytesIO(decode_base64(src[idx + 1:].encode('ascii')))
            pic = p.add_run().add_picture(stream, width)
            self.adjust_pic_width(pic, col1_width)
        else:
            try:
                resp = requests.get(settings.MEDIA_URL + src, stream=True, timeout=1)
                if resp.status_code == 200:
                    default_storage.save(target_file_name, resp.raw)
                    target_file_name = self._convert_svg_to_jpg(
                        target_file_name)
                    pic = p.add_run().add_picture(target_file_name, width)
                    # set the picture size
                    self.adjust_pic_width(pic, col1_width)
                else:
                    p.add_run("MISS IMG")
                    print(f"缺少图片:{src}")
            except RequestException:
                # Download errors are ignored; nothing is added to the paragraph.
                pass
    else:
        try:
            target_file_name = self._convert_svg_to_jpg(target_file_name)
            pic = p.add_run().add_picture(target_file_name, width)
            # set the picture size
            self.adjust_pic_width(pic, col1_width)
        except UnrecognizedImageError:
            print(f"缺少图片:{src}")
            p.add_run("MISS IMG")
def _render_span(self, p: Paragraph, pq: PyQuery, bold=False, italic=False, strike=False, underline=False,
                 font_size=None, sub=False, sup=False):
    """
    Convert a ``span`` element.

    Formula spans (a ``data-latex`` attribute, or the ``math-tex`` /
    ``afanti-latex`` class) are converted to OMML and appended to the
    paragraph element. Change 19.5.3: when formula conversion fails, the
    ``img`` inside the span is rendered instead. Any other span is rendered
    as formatted text, with the flags merged from the span's inline style.

    :param p: target paragraph
    :param pq: the span element
    :return: None
    """
    try:
        if pq.attr('data-latex'):
            # formula
            omml_str = converter.to_omml(
                self.mini_trim(pq.attr('data-latex')))
            # Declare the math namespace on the fragment before it is appended.
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
        if pq.has_class("math-tex"):
            # formula
            if pq.attr('data-latex'):
                omml_str = pq.attr('data-latex')
            else:
                omml_str = html.unescape(
                    pq.html()) if pq.html() is not None else ''
            # Drop the \( ... \) inline-math delimiters.
            omml_str = omml_str.replace(r'\(', '').replace(r'\)', '')
            omml_str = converter.to_omml(self.mini_trim(omml_str))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
        # Afanti formula
        if pq.has_class('afanti-latex'):
            metadata = AftQuestion(pq).parse_element()
            if metadata.startswith('^') or metadata.startswith('_'):
                # A leading super/subscript needs a base: move the last
                # character of the previous child into the formula.
                # NOTE(review): assumes the paragraph already has a child
                # with non-empty text — confirm.
                last_ele = pq(p._element).children()[-1]
                metadata = last_ele.text[-1] + metadata
                last_ele.text = last_ele.text[:-1]
            omml_str = converter.to_omml(self.mini_trim(metadata))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
    except EquationConvertError:
        # Conversion failed: render the image contained in the span.
        img = PyQuery('img', pq)
        self._render_img(p, img)
        return
    # Merge the inherited flags with the span's own inline style.
    bold = any([
        bold,
        self._get_pq_style(pq, 'font-weight') == 'bold',
        self._get_pq_style(pq, 'font-weight') == 'bolder'
    ])
    italic = any(
        [italic, self._get_pq_style(pq, 'font-style') == 'italic'])
    strike = any([
        strike,
        self._get_pq_style(pq, 'text-decoration') == 'line-through',
        self._get_pq_style(pq, 'text-decoration-line') == 'line-through'
    ])
    underline = any([
        underline,
        self._get_pq_style(pq, 'text-decoration') == 'underline',
        self._get_pq_style(pq, 'text-decoration-line') == 'underline'
    ])
    if self._get_pq_style(pq, 'font-size'):
        size = self._get_pq_style(pq, 'font-size')
        # Only px and pt sizes override the inherited font size.
        if size.endswith('px'):
            size = size[:-2]
            size = int(float(size))
            font_size = self.get_pt(size)
        elif size.endswith('pt'):
            size = size[:-2]
            size = float(size)
            font_size = Pt(size)
    # self.__render_inline_element(p, pq, bold=bold, italic=italic, underline=underline, font_size=font_size,
    #                              strike=strike)
    contents = pq.contents()
    for item in contents:
        if isinstance(item, (HtmlElement, _Element)):
            # NOTE(review): sub/sup are not forwarded to nested elements here — confirm intended.
            self._render_element(p, item, is_root=True, bold=bold, italic=italic, strike=strike,
                                 underline=underline, font_size=font_size)
            continue
        run = p.add_run(self._clear_text(item))
        self.__force_simsun(run)
        if self._get_pq_style(pq, 'font-name'):
            run.font.name = self._get_pq_style(pq, 'font-name')
        if font_size:
            run.font.size = font_size
        run.underline = underline
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.superscript = sup
        run.font.subscript = sub
def _render_element(self, p: Paragraph, element: str or Element, is_root=False, bold=False, italic=False,
                    strike=False, underline=False, font_size=None, sup=False, sub=False):
    """
    Convert an HTML node into Word content.

    A plain string becomes one formatted run. An element is dispatched by
    tag: inline tags switch on their formatting flag and render their
    contents, block-level tags delegate to the matching ``_render_*``
    method, and unknown tags get a new paragraph in the parent container.

    :param p: target paragraph
    :param element: text, or the element to convert
    :param is_root: for ``<p>``: render into ``p`` itself instead of a new paragraph
    :return: None
    """
    if isinstance(element, str):
        run = p.add_run(self._clear_text(element))
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.underline = underline
        run.font.subscript = sub
        run.font.superscript = sup
        if font_size:
            run.font.size = font_size
        self.__force_simsun(run)
        return
    pq = PyQuery(element)
    if pq.is_('p'):
        # Nested <p> is not supported; it is flattened automatically.
        contents = pq.contents()
        align = self._get_pq_style(pq, 'text-align')
        if align == 'center':
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == 'right':
            p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else:
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT
        if is_root:
            self._render_children(p, contents)
        else:
            # NOTE(review): the alignment was already applied to ``p`` above
            # and is applied again to the new paragraph — confirm intended.
            sub_p = p._parent.add_paragraph()
            if align == 'center':
                sub_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == 'right':
                sub_p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                sub_p.alignment = WD_ALIGN_PARAGRAPH.LEFT
            self._render_children(sub_p, contents)
    elif pq.is_('u'):
        # underline
        self.__render_inline_element(p, pq, underline=True, bold=bold, italic=italic, strike=strike,
                                     font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('strong') or pq.is_('b'):
        # bold
        self.__render_inline_element(p, pq, underline=underline, bold=True, italic=italic, strike=strike,
                                     font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('i') or pq.is_('em'):
        # italic
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=True, strike=strike,
                                     font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('sub'):
        # subscript
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=italic, strike=strike,
                                     font_size=font_size, sub=True, sup=sup)
    elif pq.is_('sup'):
        # superscript
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=italic, strike=strike,
                                     font_size=font_size, sub=sub, sup=True)
    elif pq.is_('var'):
        # legacy formula markup, rendered in italics
        self.__render_inline_element(p, pq, underline=underline, bold=bold, italic=True, strike=strike,
                                     font_size=font_size, sub=sub, sup=sup)
    elif pq.is_('span'):
        # NOTE(review): sub/sup are not forwarded to _render_span — confirm intended.
        self._render_span(p, pq, bold=bold, italic=italic, strike=strike, underline=underline,
                          font_size=font_size)
    elif pq.is_("br"):
        p.add_run().add_break()
    elif pq.is_("div"):
        # A div starts on a new line inside the same paragraph.
        # sub_p = p._parent.add_paragraph()
        p.add_run().add_break()
        self._render_children(p, pq.contents())
    elif pq.is_('ul'):
        self._render_unorder_list(p, pq)
    elif pq.is_('ol'):
        self._render_order_list(p, pq)
    elif pq.is_('table'):
        self._render_table(p, pq)
    elif pq.is_('img'):
        # image
        self._render_img(p, pq)
    elif element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # Headings: new paragraph, always bold and italic at 12pt.
        sub_p = p._parent.add_paragraph()
        self.__render_inline_element(sub_p, pq, bold=True, font_size=Pt(12), underline=underline,
                                     italic=True, strike=strike, sub=sub, sup=sup)
    else:
        # Unknown tag: render its children into a new paragraph.
        sub_p = p._parent.add_paragraph()
        contents = pq.contents()
        self._render_children(sub_p, contents)
def __init__(self, paragraph: Paragraph, ref: ReferenceList):
    """Write the FinalCif credit sentence and register its reference.

    :param paragraph: paragraph that receives the sentence as one run.
    :param ref: reference list that receives a ``FinalCifReference``.
    """
    paragraph.add_run(
        "This report and the CIF file were generated using FinalCif.")
    finalcif_reference = FinalCifReference()
    ref.append(finalcif_reference)
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add ``self.text`` as a run in ``self.style``, colored ``self.color``.

    The document argument is unused.
    """
    # Read the color first, matching the evaluation order of a one-line
    # assignment (right-hand side before the target expression).
    rgb = self.color
    colored_run = p.add_run(self.text, self.style)
    colored_run.font.color.rgb = rgb
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add ``self.text`` as a bold run in ``self.style``.

    The document argument is unused.
    """
    bold_run = p.add_run(self.text, self.style)
    bold_run.bold = True
def __init__(self, cif: CifContainer, paragraph: Paragraph):
    """
    Write the displacement-parameter / hydrogen-treatment sentence.

    The wording depends on how ``number_of_isotropic_atoms()`` compares to
    ``cif.natoms(without_h=True)``: equal -> all isotropic; between zero
    and that count -> "Some atoms"; above that count -> "Most atoms";
    otherwise all anisotropic.

    TODO: check if the proposed things are really there.
    """
    self.cif = cif
    n_isotropic = self.number_of_isotropic_atoms()
    n_non_hydrogen = self.cif.natoms(without_h=True)

    number, parameter_type = 'All', 'anisotropic'
    if n_isotropic == n_non_hydrogen:
        parameter_type = 'isotropic'
    elif 0 < n_isotropic < n_non_hydrogen:
        number = 'Some atoms ({}) were refined using isotropic displacement parameters.' \
                 ' All other'.format(n_isotropic)
    elif n_isotropic > 0 and n_isotropic > n_non_hydrogen:
        number = 'Most atoms ({}) were refined using isotropic displacement parameters.' \
                 ' All other'.format(n_isotropic)

    opening = "{} non-hydrogen atoms were refined with {} displacement parameters. " \
              "The hydrogen atoms were refined isotropically on calculated positions using a riding model " \
              "with their ".format(number, parameter_type)
    # (text, font flag) pairs in output order; None means a plain run.
    pieces = (
        (opening, None),
        ('U', 'italic'),
        ('iso', 'subscript'),
        (" values constrained to 1.5 times the ", None),
        ('U', 'italic'),
        ('eq', 'subscript'),
        (" of their pivot atoms for terminal sp", None),
        ('3', 'superscript'),
        (" carbon atoms and 1.2 times for all other carbon atoms.", None),
    )
    for content, font_flag in pieces:
        piece_run = paragraph.add_run(content)
        if font_flag is not None:
            setattr(piece_run.font, font_flag, True)
def convert(source_path, out_path, short_name, cite, year):
    """Restyle the Word document at ``source_path`` and save it to ``out_path``.

    Styles, settings, section formatting, the footnote separator and the
    headers are copied from the document at the module-level
    ``template_path``. Body text and footnotes are then normalised, the
    caption paragraphs are assigned their styles by position, and
    yellow-highlighted runs are turned into bookmarks that the bracketed
    page references in the headnotes point to.

    :param source_path: path of the document to convert.
    :param out_path: path the converted document is saved to.
    :param short_name: case name for the header; a ' v. ' separator is italicised.
    :param cite: citation for the header; its last space-separated token
        is used as the starting page number.
    :param year: year for the header.
    """
    ### TODO:
    # whitelist allowed tags
    # replace paragraph with .5 inch indented first line with a tab

    ### known changes:
    # tighter character spacing?
    # footnote numbers bold?
    # no space after footnote number?

    ### LOAD DATA ###

    # load docs
    source_doc, source_pq = load_doc(source_path)
    template_doc, template_pq = load_doc(template_path)

    # load footnotes
    footnotes_part, footnotes_el, footnotes_pq = load_part(
        source_doc.part.part_related_by(RT.FOOTNOTES))
    template_footnotes_part, template_footnotes_el, template_footnotes_pq = load_part(
        template_doc.part.part_related_by(RT.FOOTNOTES))

    ### COPY STYLES FROM TEMPLATE ###

    # copy styles, settings, and section formatting from template doc
    replace_element_contents(template_doc.styles._element,
                             source_doc.styles._element)
    replace_element_contents(template_doc.settings._element,
                             source_doc.settings._element)
    replace_element_contents(
        template_pq('w|sectPr')[0],
        source_pq('w|sectPr')[0])
    replace_element_contents(
        template_footnotes_pq('w|footnote').children()[0],
        footnotes_pq('w|footnote').children()
        [0])  # first footnote is the footnote separator

    ### HEADERS ###

    # delete existing header parts and copy in new header parts
    # (iterate over a copy because relationships are deleted inside the loop)
    for rId, rel in list(source_doc.part.rels.items()):
        if rel.reltype == RT.HEADER:
            del source_doc.part.rels[rId]
    update_refs = {}  # template relationship id -> new id in the source document
    header_parts = []
    for rId, rel in template_doc.part.rels.items():
        if rel.reltype == RT.HEADER:
            new_id = source_doc.part.rels._next_rId
            update_refs[rId] = new_id
            header_parts.append(load_part(rel.target_part))
            source_doc.part.rels.add_relationship(RT.HEADER,
                                                  rel.target_part, new_id)
            source_doc.part.package.parts.append(rel.target_part)

    # update header references
    for header_ref in source_pq('w|headerReference'):
        header_ref.attrib[qn('r:id')] = update_refs[header_ref.attrib[qn(
            'r:id')]]

    # fill in header values
    for header_part, header_el, header_pq in header_parts:
        header_pq("w|rStyle[w|val='HeaderYear']").closest('w|r')('w|t').text(
            year)
        header_pq("w|rStyle[w|val='HeaderCitation']").closest('w|r')(
            'w|t').text(cite)
        short_name_par = Paragraph(
            header_pq("w|pStyle[w|val='HeaderCaseName']").closest('w|p')[0],
            None)
        short_name_par.clear()
        # italicize v. in party name
        if ' v. ' in short_name:
            # NOTE(review): maxsplit=2 can yield three parts when ' v. '
            # occurs twice, which would fail the two-name unpacking — confirm.
            party_a, party_b = short_name.split(' v. ', 2)
            short_name_par.add_run(party_a)
            vs_run = short_name_par.add_run(' v. ')
            vs_run.italic = True
            short_name_par.add_run(party_b)
        else:
            short_name_par.add_run(short_name)

    # set starting page number
    starting_page_number = cite.rsplit(' ', 1)[-1]
    source_pq('w|sectPr').append(
        make_el(
            source_pq('w|sectPr')[0], 'w:pgNumType',
            {'w:start': starting_page_number}))

    ### TYPOGRAPHY ###

    # apply typography changes to body text and footnotes, adjusting variables that are different
    for query, allowed_styles, section_name, blockquote_style_name in (
        (source_pq, ('FootnoteReference', ), 'body', 'Blockquote'),
        (footnotes_pq, ('FootnoteText', 'FootnoteSeparator',
                        'FootnoteReference'), 'footnote',
         'FootnoteBlockquote')):

        # clear existing styles
        ignore_removed_styles = ('NormalWeb', )
        for style_tag in query('w|pStyle,w|rStyle'):
            style_name = style_tag.attrib.get(qn('w:val'))
            if style_name not in allowed_styles:
                if style_name not in ignore_removed_styles:
                    print("Warning: removing unrecognized %s style %s." %
                          (section_name, style_name))
                remove_el(style_tag)

        # mark block quotes
        for par in query('w|ind[w|left="720"]'):
            if qn('w:hanging') not in par.attrib:
                par = pq(par).closest('w|p')[0]
                par.style = blockquote_style_name

        # remove fonts and sizes
        remove_tags = ('sz', 'szCs', 'rFonts', 'ind', 'spacing', 'proofErr',
                       'bookmarkStart', 'bookmarkEnd',
                       'color[w|val="000000"]', 'lastRenderedPageBreak')
        for tag in remove_tags:
            query('w|' + tag).remove()

        # underline to italic
        # (double underline becomes small caps instead)
        for el in query('w|u'):
            if el.attrib.get(qn('w:val')) == 'double':
                el.tag = qn('w:smallCaps')
            else:
                el.tag = qn('w:i')
            el.attrib.clear()

        # combine consecutive runs with identical formatting
        query('w|t').attr(
            qn('xml:space'),
            'preserve')  # add preserve to all t blocks for uniformity
        skip = 0
        for run in query('w|r'):
            # skip runs that have already been appended to previous run and detached
            if skip:
                skip -= 1
                continue
            blank_r1 = blank_run(run)
            while True:
                r2 = pq(run).next()
                if not r2:
                    break
                r2 = r2[0]
                # Stop merging at the first sibling whose tag or blanked
                # serialisation differs.
                if r2.tag != run.tag or etree.tostring(
                        blank_r1) != etree.tostring(blank_run(r2)):
                    break
                run.text += r2.text
                remove_el(r2)
                skip += 1

        # text replacements
        for t in query('w|t'):
            text = t.text
            # fix dashes
            text = text.replace(" -- ", " — ")
            # remove double spaces
            text = re.sub(' +', ' ', text)
            # fix quotes
            for straight_quote, left_quote, right_quote in (('"', '“', '”'),
                                                            ("'", '‘', '’')):
                if straight_quote not in text:
                    continue
                # right smart quotes
                text = re.sub(r'([a-zA-Z0-9.,?!;:\'\"])%s' % straight_quote,
                              r'\1%s' % right_quote, text)
                text = re.sub(r'%s ' % straight_quote, r'%s ' % right_quote,
                              text)
                # remaining are left smart quotes
                text = text.replace(straight_quote, left_quote)
            t.text = text

    ### FOOTNOTES ###

    footnote_tab = deepcopy(
        template_footnotes_pq('w|footnote:not([w|type]) w|r')
        [0])  # first run in template footnotes is a tab
    for footnote in footnotes_pq('w|footnote:not([w|type])'):
        # remove extra tabs from footnotes, add single tab
        for run in pq(footnote, namespaces=nsmap)('w|r'):
            if pq(run, namespaces=nsmap)('w|tab'):
                remove_el(run)
            else:
                pq(run).before(deepcopy(footnote_tab))
                break

        # make sure footnotes have FootnoteText style
        for par in pq(footnote, namespaces=nsmap)('w|p'):
            if not par.style:
                par.style = 'FootnoteText'

    ### CAPTION ###

    def skip_blanks(paragraphs, par_num):
        # Return the index of the next paragraph after par_num for which
        # has_text() is true.
        par_num += 1
        while not has_text(paragraphs[par_num]):
            par_num += 1
        return par_num

    # delete first four paragraphs
    pq(source_pq('w|p')[:4]).remove()
    paragraphs = source_pq('w|p')

    # format first paragraph
    par_num = 0
    paragraphs[par_num].style = 'CaseName'
    # process the case name so all-caps becomes small-caps:
    for run in pq(paragraphs[par_num])('w|r'):
        parts = re.split(r'([A-Z][A-Z]+)', run.text)
        if len(parts) > 1:
            new_runs = split_run(run, parts)
            for new_run in new_runs[
                    1::
                    2]:  # every other part will be all-caps, so should become small-caps
                Run(new_run, None).font.small_caps = True
                new_run.text = new_run.text.title()

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Dates'

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Judges'

    par_num = skip_blanks(paragraphs, par_num)
    paragraphs[par_num].style = 'Categories'

    par_num = skip_blanks(paragraphs, par_num)
    # Steps by two: presumably headnotes alternate with blank paragraphs — confirm.
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Headnote'
        par_num += 2
    # extra space for last headnote
    Paragraph(paragraphs[par_num - 2],
              None).paragraph_format.space_after = Pt(12)

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'History'
        par_num += 2

    par_num = skip_blanks(paragraphs, par_num)
    while has_text(paragraphs[par_num]):
        paragraphs[par_num].style = 'Appearance'
        par_num += 1

    # mark author name -- first sentence of first paragraph of case text
    par_num = skip_blanks(paragraphs, par_num)
    first_paragraph = Paragraph(paragraphs[par_num], source_doc._body)
    try:
        first_run = next(r for r in first_paragraph.runs if r.text.strip())
        first_run, second_run = split_run(first_run._element,
                                          first_run.text.split('.', 1))
        first_run.text = first_run.text.title() + "."
        Run(first_run, first_paragraph).style = "Author"
    except Exception as e:
        # Log the searched text, then re-raise the original error.
        print("Warning: failed to detect author name. Searched this text: %s"
              % first_paragraph.text)
        raise

    # remove blank paragraphs
    # this has to come AFTER caption processing so we can tell sections apart
    for query in (source_pq, footnotes_pq('w|footnote:not([w|type])')):
        for p in query('w|p'):
            if not has_text(p):
                remove_el(p)

    ### HEADNOTE PAGE RANGES ###

    # replace highlighted headnote markers with bookmarks
    # (even positions are named HeadnoteStart<n>, odd positions HeadnoteEnd<n>)
    bookmarks = []
    for i, highlight_run in enumerate(
            source_pq("w|highlight[w|val='yellow']")):
        highlight_run = pq(highlight_run).closest('w|r')
        bookmark_name = "Headnote%s%s" % ("End" if i % 2 else "Start",
                                          int(i / 2))
        highlight_run.after(
            pq([
                make_el(highlight_run[0], "w:bookmarkStart", {
                    "w:id": str(i),
                    "w:name": bookmark_name
                }),
                make_el(highlight_run[0], "w:bookmarkEnd", {"w:id": str(i)})
            ]))
        remove_el(highlight_run[0])
        bookmarks.append(bookmark_name)

    # replace headnote page number references with bookmark shortcodes
    # The template is a PAGEREF to the start bookmark followed by an IF
    # field comparing the start and end PAGEREFs, whose non-equal result
    # is a dash plus the end page.
    reference_template = """
        <w:fldSimple w:instr=" PAGEREF {bookmark_start} ">
            <w:r><w:rPr><w:noProof/></w:rPr><w:t>PRINT</w:t></w:r>
        </w:fldSimple>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> IF </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_start} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> = </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText xml:space="preserve"> "" "-</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="begin"/></w:r>
        <w:r><w:instrText xml:space="preserve"> PAGEREF {bookmark_end} </w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="separate"/></w:r>
        <w:r><w:rPr><w:noProof/></w:rPr><w:instrText>PRINT</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
        <w:r><w:instrText>"</w:instrText></w:r>
        <w:r><w:fldChar w:fldCharType="end"/></w:r>
    """
    for headnote in source_pq('w|pStyle[w|val="Headnote"]'):
        for run in pq(headnote).closest('w|p')('w|r'):
            run = pq(run)
            parts = re.split(r'\[.*?\]', run('w|t').text())
            if len(parts) > 1:
                new_els = []
                for i, part in enumerate(parts):
                    if i != 0:
                        # Each bracketed reference consumes the next
                        # start/end bookmark pair, in document order.
                        new_els.extend(
                            parse_xml_fragment(
                                run[0],
                                reference_template.format(
                                    bookmark_start=bookmarks.pop(0),
                                    bookmark_end=bookmarks.pop(0))))
                    new_run = deepcopy(run[0])
                    pq(new_run)('w|t').text(("]" if i != 0 else "") + part + (
                        "[" if i != len(parts) - 1 else ""))
                    new_els.append(new_run)
                run.after(pq(new_els))
                remove_el(run[0])

    ### OUTPUT ###

    # write footnotes and headers
    save_part(footnotes_el, footnotes_part)
    for header_part, header_el, header_pq in header_parts:
        save_part(header_el, header_part)

    # save output
    #save_xml(out_path, source_doc)
    source_doc.save(out_path)
def render(self, p: Paragraph, _: docx.document.Document) -> None:
    """Add the text produced by ``self.ref(self.key)`` as a run.

    Nothing is added when ``self.ref`` is falsy. The document argument is
    unused.
    """
    if not self.ref:
        return
    resolved_text = self.ref(self.key)
    p.add_run(resolved_text)
def add_reference(self, p: Paragraph):
    """Write this reference into ``p`` as a sequence of formatted runs.

    Order and formatting: authors, journal (italic), year (bold), volume
    (italic), pages, doi. Fields with falsy values are skipped. A journal
    that already ends with '.' is followed by a space instead of ', '. A
    final '.' is added when any field other than the authors is present.
    """
    if self.authors:
        p.add_run(self.authors)
        p.add_run(', ')

    if self.journal:
        p.add_run(self.journal).italic = True
        journal_separator = ' ' if self.journal.endswith('.') else ', '
        p.add_run(journal_separator)

    if self.year:
        p.add_run(self.year).bold = True
        p.add_run(', ')

    if self.volume:
        p.add_run(self.volume).italic = True
        p.add_run(', ')

    if self.pages:
        p.add_run(self.pages)
        # A separator after the pages is only needed when a doi follows.
        if self.doi:
            p.add_run(', ')

    if self.doi:
        p.add_run(self.doi)

    citation_fields = (self.journal, self.pages, self.year, self.volume, self.doi)
    if any(citation_fields):
        p.add_run('.')
def text_set_fixture(self):
    """Return ``(paragraph, new_text_value, expected_text_value)``.

    The paragraph starts with one run of text that must not appear in the
    result. The expected value equals the new value with the carriage
    return replaced by a newline.
    """
    prefilled = Paragraph(element('w:p'), None)
    prefilled.add_run('must not appear in result')
    return prefilled, 'foo\tbar\rbaz\n', 'foo\tbar\nbaz\n'