def process(oldnode, informat):
    """Normalize the text content of one XML field element in place.

    Elements whose tag is in the skip list (URLs, identifiers, attachments,
    ...) are left untouched. For ``author``/``editor`` elements every child
    is processed recursively. Any other element is handled as follows:

    1. If ``informat == "latex"``, the element is flattened to its text,
       converted with ``latex_to_xml`` and swapped in via ``replace_node``.
    2. ``html.unescape``, ``curly_quotes`` and ``clean_unicode`` are applied
       through ``maptext``.
    3. ``title`` and ``booktitle`` elements are finally passed to ``protect``.

    Args:
        oldnode: Element to normalize. It is used through the ElementTree-style
            API (``tag``, ``attrib``, ``len``, iteration, ``itertext``).
        informat: Name of the input format; only ``"latex"`` triggers the
            LaTeX conversion step.

    Returns:
        None.
    """
    skipped_tags = (
        'url', 'href', 'mrf', 'doi', 'bibtype', 'bibkey', 'revision',
        'erratum', 'attachment', 'paper', 'presentation', 'dataset',
        'software', 'video',
    )
    title_tags = ('title', 'booktitle')

    if oldnode.tag in skipped_tags:
        return

    if oldnode.tag in ('author', 'editor'):
        for oldchild in oldnode:
            process(oldchild, informat=informat)
        return

    if informat == "latex":
        # Explicit length check: the truth value of an Element is not a
        # reliable "has children" test.
        if len(oldnode) > 0:
            # Child markup is lost below because only the text is kept.
            logging.error("field has child elements {}".format(', '.join(
                child.tag for child in oldnode)))
        oldtext = ''.join(oldnode.itertext())
        # Request fixed-case handling only for the tags that are later passed
        # to protect(); previously it was requested for every field.
        newnode = latex_to_xml(
            oldtext,
            trivial_math=True,
            fixed_case=oldnode.tag in title_tags,
        )
        newnode.tag = oldnode.tag
        newnode.attrib.update(oldnode.attrib)
        replace_node(oldnode, newnode)

    maptext(oldnode, html.unescape)
    maptext(oldnode, curly_quotes)
    maptext(oldnode, clean_unicode)
    if oldnode.tag in title_tags:
        protect(oldnode)
def normalize(oldnode, informat):
    """
    Clean up the text of a single XML field element, modifying it in place.

    Steps applied to an ordinary field:
    - HTML entities are unescaped
    - Quotes and other punctuation are normalized
    - Many characters are mapped to their unicode form

    When 'informat' is "latex", the field is first flattened to plain text
    and run through the LaTeX-to-XML converter. Note that this LaTeX step is
    not idempotent.

    Link/identifier/attachment-style elements are skipped entirely, and for
    'author' / 'editor' elements the children are normalized instead of the
    element itself.
    """
    untouched_tags = (
        "url",
        "href",
        "mrf",
        "doi",
        "bibtype",
        "bibkey",
        "revision",
        "erratum",
        "attachment",
        "paper",
        "presentation",
        "dataset",
        "software",
        "video",
    )
    person_tags = ("author", "editor")
    title_tags = ("title", "booktitle")

    # Guard clause: these elements are never rewritten.
    if oldnode.tag in untouched_tags:
        return

    # Name containers: recurse into each child and stop.
    if oldnode.tag in person_tags:
        for oldchild in oldnode:
            normalize(oldchild, informat=informat)
        return

    if informat == "latex":
        if len(oldnode) > 0:
            child_tags = ", ".join(child.tag for child in oldnode)
            logging.error("field has child elements {}".format(child_tags))
        flattened = "".join(oldnode.itertext())
        wants_fixed_case = oldnode.tag in title_tags
        converted = latex_to_xml(
            flattened, trivial_math=True, fixed_case=wants_fixed_case
        )
        converted.tag = oldnode.tag
        converted.attrib.update(oldnode.attrib)
        replace_node(oldnode, converted)

    # Text clean-ups, applied in this fixed order.
    for transform in (html.unescape, curly_quotes, clean_unicode):
        maptext(oldnode, transform)

    if oldnode.tag in title_tags:
        protect(oldnode)