def compare_dirs(one, two):
    '''Compare the .xml files below directory `one` with their counterparts in `two`.

    Every file under `one` whose name ends in '.xml' is looked up at the same
    relative path under `two`. If it exists there, both files are parsed and
    their tei:text and ge:document elements are compared via compare_elements
    (described by the original author as ignoring whitespace). When the
    module-level flag `replace_new` is set, each element that compared equal
    is replaced in the file from `one` by the element from `two`; if at least
    one element was equal, a change entry is recorded and the file in `one`
    is rewritten in place.

    One status line per .xml file is printed to stdout. Returns None.
    '''
    for root, _subdirs, names in os.walk(one):
        for name in names:
            if not name.endswith('.xml'):
                continue

            path_1 = os.path.join(root, name)
            relpath = os.path.relpath(path_1, one)
            path_2 = os.path.join(two, relpath)
            print(relpath, '\t', end='')

            # No counterpart in the second directory: report and move on.
            if not os.path.isfile(path_2):
                print("not in dir2", end='')
                print()
                continue

            try:
                tree_1 = lxml.etree.parse(path_1)
                tree_2 = lxml.etree.parse(path_2)

                # Detach the first template element from the first tree so
                # that it does not take part in the comparison.
                templates = template_xp(tree_1)
                if templates:
                    first_template = templates[0]
                    first_template.getparent().remove(first_template)

                print("tei:text ", end='')
                txt_equal = compare_elements(txt_xp(tree_1), txt_xp(tree_2))
                print("\t", end='')

                print("ge:document ", end='')
                doc_equal = compare_elements(doc_xp(tree_1), doc_xp(tree_2))
                print("\t", end='')

                any_equal = txt_equal or doc_equal

                if replace_new:
                    for is_equal, select in ((txt_equal, txt_xp), (doc_equal, doc_xp)):
                        if is_equal:
                            replace(select(tree_1)[0], select(tree_2)[0])
                    # Put a copy of the detached template back into the body.
                    if templates:
                        body_xp(tree_1)[0].append(copy.deepcopy(templates[0]))
                    if any_equal:
                        rev_desc.add_change(tree_1, "system", "whitespace-restored")
                        faust.tei_serialize(tree_1).write(path_1, encoding="UTF-8")

                if templates:
                    print(" *t* ", end='')
                if any_equal:
                    print(" ***MODIFIED***", end='')
            except lxml.etree.XMLSyntaxError:
                print("XML syntax error", end='')
            except ValueError as e:
                print(e, end='')

            # Terminate the status line of this file.
            print()
def compare_dirs(one, two):
    """Compare the .xml files below directory `one` with their counterparts in `two`.

    Every file under `one` whose name ends in ".xml" is looked up at the same
    relative path under `two`. If it exists there, both files are parsed and
    their tei:text and ge:document elements are compared via compare_elements
    (described by the original author as ignoring whitespace). When the
    module-level flag `replace_new` is set, each element that compared equal
    is replaced in the file from `one` by the element from `two`; if at least
    one element was equal, a change entry is recorded and the file in `one`
    is rewritten in place.

    One status line per .xml file is printed to stdout. Returns None.
    """
    for root, _subdirs, names in os.walk(one):
        for name in names:
            if not name.endswith(".xml"):
                continue

            path_1 = os.path.join(root, name)
            relpath = os.path.relpath(path_1, one)
            path_2 = os.path.join(two, relpath)
            print(relpath, "\t", end="")

            # No counterpart in the second directory: report and move on.
            if not os.path.isfile(path_2):
                print("not in dir2", end="")
                print()
                continue

            try:
                tree_1 = lxml.etree.parse(path_1)
                tree_2 = lxml.etree.parse(path_2)

                # Detach the first template element from the first tree so
                # that it does not take part in the comparison.
                templates = template_xp(tree_1)
                if templates:
                    first_template = templates[0]
                    first_template.getparent().remove(first_template)

                print("tei:text ", end="")
                txt_equal = compare_elements(txt_xp(tree_1), txt_xp(tree_2))
                print("\t", end="")

                print("ge:document ", end="")
                doc_equal = compare_elements(doc_xp(tree_1), doc_xp(tree_2))
                print("\t", end="")

                any_equal = txt_equal or doc_equal

                if replace_new:
                    for is_equal, select in ((txt_equal, txt_xp), (doc_equal, doc_xp)):
                        if is_equal:
                            replace(select(tree_1)[0], select(tree_2)[0])
                    # Put a copy of the detached template back into the body.
                    if templates:
                        body_xp(tree_1)[0].append(copy.deepcopy(templates[0]))
                    if any_equal:
                        rev_desc.add_change(tree_1, "system", "whitespace-restored")
                        faust.tei_serialize(tree_1).write(path_1, encoding="UTF-8")

                if templates:
                    print(" *t* ", end="")
                if any_equal:
                    print(" ***MODIFIED***", end="")
            except lxml.etree.XMLSyntaxError:
                print("XML syntax error", end="")
            except ValueError as e:
                print(e, end="")

            # Terminate the status line of this file.
            print()
def tei_transform(tei_file, transform_etree):
    """Apply a tree transformation to a TEI file and write the result back in place.

    Args:
        tei_file: path of the file to transform; it is also the output path.
        transform_etree: callable that receives the parsed lxml tree and
            returns the tree to be serialized.

    Returns:
        None. If faust.is_tei_document() rejects the file, a message is
        written to stderr and the file is left untouched.

    IOError and lxml.etree.XMLSyntaxError raised while checking, parsing or
    writing are caught and reported on stderr; other exceptions propagate.
    """
    try:
        if not faust.is_tei_document(tei_file):
            # Report the path that was actually passed in. The previous code
            # concatenated the unrelated name `file`, which made this branch
            # fail (NameError on Python 3, TypeError on Python 2) instead of
            # printing the message.
            sys.stderr.write("Not a TEI file: " + tei_file + "\n")
            return
        xml = lxml.etree.parse(tei_file)
        result = transform_etree(xml)
        faust.tei_serialize(result).write(tei_file, encoding="UTF-8")
    except IOError:
        sys.stderr.write("I/O error while transforming " + tei_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error while transforming " + tei_file + "\n")
def correct_graphic_uris():
    """Correct the ``url`` attribute of the graphic elements of transcript files.

    The files processed are the entries of faust.transcript_files() whose
    path contains '/gsa/', followed by the names returned by
    xml_names_from_facsimiles(). For each file that can be parsed:

    * if graphic_xp() finds no graphic element, append_facsimile_element()
      is called on the tree and the lookup is repeated;
    * every graphic's ``url`` is passed through correct_uri(), with the
      ``brutal`` flag set when the file has exactly one graphic;
    * the file is rewritten (after recording a "facsimile_adapted" change)
      only if at least one url actually changed.

    Progress is printed to stdout. Files that raise IOError on parsing are
    reported and skipped. Returns None.

    Output is produced with single-argument print(...) calls so the text is
    the same whether print is a statement (Python 2) or a function.
    """
    # take into account old GSA files
    files = [f for f in faust.transcript_files() if '/gsa/' in f]
    files.extend(xml_names_from_facsimiles())

    for f in files:
        try:
            xml = lxml.etree.parse(f)
        except IOError:
            # these should only be GSA files
            print("( %s  doesn't exist)" % (f,))
            continue

        print(f)

        graphics = graphic_xp(xml)
        if not graphics:
            append_facsimile_element(xml)
            # find the newly appended element
            graphics = graphic_xp(xml)

        # A single graphic per file switches correct_uri() into "brutal" mode.
        brutal = len(graphics) == 1

        rewrite_file = False
        for graphic in graphics:
            old = graphic.attrib["url"]
            new = correct_uri(old, brutal, f)
            graphic.attrib["url"] = new
            if new != old:
                print(" correcting:  %s  ->  %s" % (old, new))
                rewrite_file = True

        if rewrite_file:
            rev_desc.add_change(xml, "system", "facsimile_adapted")
            print(" writing")
            faust.tei_serialize(xml).write(f, encoding='UTF-8')
        else:
            print(" not writing")
def make_template(path):
    """Serialize the module-level `doc_template` tree to `path` as UTF-8.

    Prints a "creating:" line with the target path before writing.
    Returns None.
    """
    # Single-argument print(...) gives the same text ("creating: ", a
    # separating space, then the path) whether print is a statement
    # (Python 2) or a function.
    print("creating:  %s" % (path,))
    faust.tei_serialize(doc_template).write(path, encoding='UTF-8')
# Locate and parse the TEI template document.
tei_template = faust.absolute_path("template/tei.xml")
template = lxml.etree.parse(tei_template)

# Header fragments of the template that get copied into every TEI file.
template_hand_notes = handNotes_xp(template)[0]
template_char_decl = charDecl_xp(template)[0]


def replace(node, with_node):
    """Swap `node` in its parent for a deep copy of `with_node`.

    The copy makes it possible for `with_node` to come from another document.
    """
    parent = node.getparent()
    parent.replace(node, copy.deepcopy(with_node))


# Visit every TEI file except the template itself.
for xml_file in faust.xml_files():
    try:
        if xml_file == tei_template or not faust.is_tei_document(xml_file):
            continue

        xml = lxml.etree.parse(xml_file)

        # Overwrite each header fragment found with the template's version:
        # first all hand notes, then all character declarations.
        for header_xp, fragment in (
            (handNotes_xp, template_hand_notes),
            (charDecl_xp, template_char_decl),
        ):
            for outdated in header_xp(xml):
                replace(outdated, fragment)

        # Write the updated document back to the same file.
        faust.tei_serialize(xml).write(xml_file, encoding="UTF-8")
    except IOError:
        sys.stderr.write("I/O error while updating " + xml_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error while updating " + xml_file + "\n")
def make_template(path):
    """Serialize the module-level `doc_template` tree to `path` as UTF-8.

    Prints a "creating:" line with the target path before writing.
    Returns None.
    """
    # Single-argument print(...) gives the same text ("creating: ", a
    # separating space, then the path) whether print is a statement
    # (Python 2) or a function.
    print("creating:  %s" % (path,))
    faust.tei_serialize(doc_template).write(path, encoding='UTF-8')