def to_convert():
    """Collect transcript files that still need conversion: files under
    transcript/ whose tei:text carries content while ge:document is empty.

    Returns a list of absolute paths of TEI documents to convert.
    """
    text_xp = faust.xpath("normalize-space(//tei:text)")
    doc_xp = faust.xpath("normalize-space(//ge:document)")

    def contains_text(xp, tree):
        # truthy iff the normalized result holds any non-space character
        return len(" ".join(xp(tree)).strip()) > 0

    candidates = []
    for xml_file in faust.xml_files():
        parts = faust.relative_path(xml_file).split("/")
        if parts[0] != "transcript":
            continue
        base = parts[-1]
        # skip the file named after its parent directory
        # (presumably the per-document "main" file -- TODO confirm)
        if base[:-len(".xml")] == parts[-2]:
            continue
        # skip files whose first number is 1 (apparently the first page)
        if int(re.search(r"[0-9]+", base).group(0)) == 1:
            continue
        if not faust.is_tei_document(xml_file):
            continue
        tree = lxml.etree.parse(xml_file)
        if contains_text(text_xp, tree) and not contains_text(doc_xp, tree):
            candidates.append(xml_file)
    return candidates
def unique_values(files, xpath):
    """List all unique values for matches of an xpath.

    Elements are represented by their tag; non-element matches
    (attribute values, text results) by their string form.  The original
    unconditional ``result.tag`` raised AttributeError whenever the
    xpath returned strings instead of elements.
    """
    def value_of(match):
        try:
            return match.tag
        except AttributeError:
            return str(match)

    unique = set()
    for f in files:
        try:
            xml = lxml.etree.parse(f)
            results = [value_of(result) for result in faust.xpath(xpath)(xml)]
            unique = unique.union(results)
        except lxml.etree.XMLSyntaxError:
            # report unparsable files but keep scanning the rest
            sys.stderr.write("XML syntax error: " + f + "\n")
    return unique
def unique_values(files, xpath):
    """List all unique values for matches of an xpath.

    Matches are normalized to strings (an element's tag, or ``str`` of
    attribute/text results).  The original stored raw lxml element
    objects, which hash by identity, so "unique values" never deduped
    across documents and the returned set was not meaningfully usable.
    """
    def value_of(match):
        try:
            return match.tag
        except AttributeError:
            return str(match)

    unique = set()
    for f in files:
        try:
            xml = lxml.etree.parse(f)
            results = [value_of(result) for result in faust.xpath(xpath)(xml)]
            unique = unique.union(results)
        except lxml.etree.XMLSyntaxError:
            # report unparsable files but keep scanning the rest
            sys.stderr.write("XML syntax error: " + f + "\n")
    return unique
def unique_values(files, xpath):
    """List all unique values for matches of an xpath.

    Elements contribute their tag; non-element matches (attribute
    values, text) contribute their string form.  Files with XML syntax
    errors are reported on stderr and skipped.
    """
    def to_str(val):
        try:
            return val.tag
        except AttributeError:
            return str(val)

    unique = set()
    for f in files:
        try:
            xml = lxml.etree.parse(f)
            results = [to_str(result) for result in faust.xpath(xpath)(xml)]
            unique = unique.union(results)
        except lxml.etree.XMLSyntaxError as e:
            # terminate the detail message with a newline so consecutive
            # error reports do not run together on stderr (the original
            # wrote str(e) with no trailing newline)
            sys.stderr.write("XML syntax error: " + f + "\n")
            sys.stderr.write(str(e) + "\n")
    return unique
def does_match(file):
    """Evaluate the module-level ``xpath`` against *file* and return the
    result (truthy when there are matches).

    On an XML syntax error the problem is logged to stderr and the
    function falls through, returning None.
    """
    try:
        tree = lxml.etree.parse(file)
        return faust.xpath(xpath)(tree)
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error: " + file + "\n")
# Assemble the page list and metadata for one GSA document, then write
# the result to document/.../gsa_<ident>.xml.

# one materialUnit per page, inserted in sorted order at the front
last = None
pages.sort()
for p in pages:
    p_xml = lxml.etree.Element(faust_ns + "materialUnit")
    p_xml.set("type", "page")
    p_xml.set("transcript", p)
    if last is None:
        document_xml.insert(0, p_xml)
    else:
        last.addnext(p_xml)
    last = p_xml

# the metadata block precedes all pages
metadata_xml = lxml.etree.Element(faust_ns + "metadata")
document_xml.insert(0, metadata_xml)
lxml.etree.SubElement(metadata_xml, faust_ns + "archive").text = "gsa"

callnumber = faust.xpath("//f:signatur/text()", document_xml)[0]
if callnumber in gsa_callnumber_mapping:
    # prefer the mapped call number, keeping the original in parentheses
    callnumber = gsa_callnumber_mapping[callnumber] + " (" + callnumber + ")"
lxml.etree.SubElement(metadata_xml, faust_ns + "callnumber").text = callnumber

# Weimarer Ausgabe id, if present; "-" and "oS" apparently mark
# "no id" -- confirm against the GSA source data
wa_id_matches = faust.xpath("//f:key[@n='25']/following::f:value", document_xml)
if wa_id_matches:
    wa_id = wa_id_matches[0].text
    if wa_id not in ("-", "oS"):
        lxml.etree.SubElement(metadata_xml, faust_ns + "waId").text = wa_id

xml_dir = faust.absolute_path("/".join(("document", ) + documents_struct[gsa_ident][0]))
if not os.path.isdir(xml_dir):
    os.makedirs(xml_dir)
document_xml.getroottree().write("/".join((xml_dir, "gsa_" + gsa_ident + ".xml")),
                                 encoding="UTF-8", pretty_print=True)
# NOTE(review): extraction artifact -- an entire script collapsed onto one
# physical line and truncated mid-loop (the visible "try:" has no matching
# "except:").  Code left byte-identical rather than restyled.
# Purpose, from the visible code: scan every TEI file for elements inside
# tei:text or ge:document that lack an explicit @xml:space attribute,
# skipping SVG elements; the two tuples list container tags and
# empty-element tags to be ignored by the (unseen) remainder of the loop.
# Search for missing @xml:space import lxml.etree import faust ignored_tags = ( "app", "back", "body", "choice", "div", "docTitle", "fix", "front", "fw", "g", "group", "lg", "overw", "patch", "sp", "subst", "surface", "text", "titlePage", "titlePart", "used", "zone") ignored_empty_elems = ( "addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", "ins", "join", "lb", "pb", "space", "st", "undo", "p") element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]") text_content_xp = faust.xpath("normalize-space()") candidates = dict() for xml_file in faust.xml_files(): try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) xml_key = faust.relative_path(xml_file) candidates[xml_key] = [] for elem in element_selector_xp(xml): if elem.tag.startswith(faust.ns("svg")): continue local_name = elem.tag[elem.tag.rfind("}") + 1:]
# NOTE(review): collapsed + truncated fragment (facs_uri_from_xml is cut
# off mid-body; L11 shows a fuller variant of the same script).  Left
# byte-identical.  Visible behavior: loads the TEI template, prepares
# XPaths for facsimile graphics and the TEI header, and maps
# faust://facsimile/ URIs to transcript XML paths and back.
#!/usr/bin/env python # coding=UTF-8 # # Correct the links to facsimile files # import faust import transform import lxml import os import sys import rev_desc doc_template = lxml.etree.parse( os.path.join(faust.xml_dir, "template", "tei.xml")) graphic_xp = faust.xpath("//tei:facsimile/tei:graphic") header_xp = faust.xpath("/tei:TEI/tei:teiHeader") valid_graphic_uris = faust.facsimiles() def xml_names_from_facsimiles(): prefix_length = len(faust.faust_scheme + "://facsimile/") def to_xml_path(facs_uri): stripped = facs_uri[prefix_length:] return os.path.join(faust.xml_dir, "transcript", stripped + ".xml") return map(to_xml_path, faust.facsimiles()) def facs_uri_from_xml(path): stripped = path[len(faust.xml_dir + "/facsimile/"):-len(".xml")]
def facs_invalid(file):
    """Return True iff *file* references a facsimile URL that appears in
    the module-level ``faust_facsimiles`` collection.

    NOTE(review): despite the name, this reports True when a URL *is*
    found in ``faust_facsimiles`` -- verify whether that collection
    holds invalid (rather than valid) facsimile URIs.
    """
    tree = lxml.etree.parse(file)
    for url in faust.xpath("//tei:facsimile/tei:graphic/@url")(tree):
        if url in faust_facsimiles:
            return True
    return False
# NOTE(review): collapsed + truncated fragment (the loop over revision
# changes is cut off after "encoded = True").  Left byte-identical.
# Visible behavior: for every TEI file with non-empty ge:document
# content, inspect tei:revisionDesc change entries for the word
# "encoded".  The comment above ge_doc_xp is a copy-paste of the
# change_xp comment and does not describe ge_doc_xp.
#!/usr/bin/env python # # Report on the transcript status as specified under # # https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription # import sys import lxml.etree import faust # XPath expression for extracting the revision history from TEI documents ge_doc_xp = faust.xpath("normalize-space(//ge:document)") # XPath expression for extracting the revision history from TEI documents change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change") # iterate over all TEI documents for xml_file in faust.xml_files(): status = None try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) if len(ge_doc_xp(xml).strip()) == 0: continue encoded = False for change in change_xp(xml): change_str = lxml.etree.tostring(change).lower() if "encoded" in change_str: encoded = True
# NOTE(review): collapsed + truncated variant of the facsimile-link
# script (cut off inside make_xml_templates).  Left byte-identical.
# Visible behavior: derives transcript XML file names from facsimile
# URIs and vice versa, using faust.xml_dir and the faust:// scheme.
#!/usr/bin/env python # coding=UTF-8 # # Correct the links to facsimile files # import faust import transform import lxml import os import sys import rev_desc doc_template = lxml.etree.parse(os.path.join(faust.xml_dir, "template", "tei.xml")) graphic_xp = faust.xpath ("//tei:facsimile/tei:graphic") header_xp = faust.xpath ("/tei:TEI/tei:teiHeader") valid_graphic_uris = faust.facsimiles() def xml_names_from_facsimiles(): prefix_length = len(faust.faust_scheme + "://facsimile/") def to_xml_path (facs_uri): stripped = facs_uri[prefix_length:] return os.path.join(faust.xml_dir, "transcript" , stripped + ".xml") return map (to_xml_path, faust.facsimiles()) def facs_uri_from_xml(path): stripped = path[len(faust.xml_dir + "/facsimile/") : - len(".xml")] return faust.faust_scheme + "://facsimile" + stripped def make_xml_templates(): xml_templates = xml_names_from_facsimiles() # check if all directories exist
# NOTE(review): collapsed + truncated duplicate of the @xml:space scan
# (the "try:" has no visible "except:").  Left byte-identical.
# Visible behavior: select elements under tei:text / ge:document that
# lack @xml:space, skipping SVG-namespace elements.
# Search for missing @xml:space import lxml.etree import faust ignored_tags = ("app", "back", "body", "choice", "div", "docTitle", "fix", "front", "fw", "g", "group", "lg", "overw", "patch", "sp", "subst", "surface", "text", "titlePage", "titlePart", "used", "zone") ignored_empty_elems = ("addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", "ins", "join", "lb", "pb", "space", "st", "undo", "p") element_selector_xp = faust.xpath( "//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]") text_content_xp = faust.xpath("normalize-space()") candidates = dict() for xml_file in faust.xml_files(): try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) xml_key = faust.relative_path(xml_file) candidates[xml_key] = [] for elem in element_selector_xp(xml): if elem.tag.startswith(faust.ns("svg")): continue local_name = elem.tag[elem.tag.rfind("}") + 1:]
# NOTE(review): collapsed + truncated fragment (count() is cut off right
# after "status = set()").  Left byte-identical.  Visible behavior:
# initialize one counter per configured log-status key, then iterate
# transcript files; the per-file logic is outside this view.
#!/usr/bin/env python # # Report on the transcript status as specified under # # https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription # import sys import lxml.etree import faust # XPath expression for extracting the revision history from TEI documents change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change") def count(): # status counters status_keys = [key for (key, value) in faust.config.items("log-status")] status_dict = {} for key in status_keys: status_dict[key] = 0 status_keys.sort() status_unknown = 0 # iterate over all TEI documents for xml_file in faust.transcript_files(): status = set()
# NOTE(review): collapsed + truncated fragment -- the built <change>
# element is never appended within this view, so the tail of add_change
# is missing.  Left byte-identical.  Also note: the default
# when=datetime.date.today().isoformat() is evaluated once at import
# time, not per call -- a known Python default-argument pitfall; flag
# for the full file.
#!/usr/bin/env python # coding=UTF-8 # # Add a revision description change to an xml file represented by an ElementTree import lxml.etree import faust import datetime revdesc_xp = faust.xpath("//tei:teiHeader/tei:revisionDesc") header_xp = faust.xpath("//tei:teiHeader") def add_change(xml, who, content, when=datetime.date.today().isoformat()): '''Adds a change element to the revisionDesc in the TEI header''' header = header_xp(xml) if not header: raise ValueError("No TEI header present") # if there is no tei:revisionDesc element, insert one if not revdesc_xp(xml): rev_desc_element = lxml.etree.Element(faust.ns("tei") + "revisionDesc") # revisionDesc always goes to the end of the header header[0].append(rev_desc_element) # build change element attribs = {"when": when, "who": who} change = lxml.etree.Element(faust.ns("tei") + "change", attribs) change.text = content
def matches_in_file(file):
    """Return ``[file, matches]`` for the module-level ``xpath``
    evaluated against *file*.

    On an XML syntax error the problem is logged to stderr and None is
    returned implicitly.
    """
    try:
        parsed = lxml.etree.parse(file)
        return [file, faust.xpath(xpath)(parsed)]
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error: " + file + "\n")
# NOTE(review): one very large convert() function, collapsed onto six
# physical lines that break mid-expression at each line boundary (e.g.
# 'if "zone" ==' / 'div.get("type", ""):').  Restyling is unsafe here;
# code left byte-identical.  Visible pipeline, in order: build a
# ge:document/tei:surface wrapper from tei:body; push @hand from
# subst/restore down to add/del and convert @hand to handShift pairs;
# div[@type='zone'] -> tei:zone; overwrite-typed subst -> f:overw with
# f:under/f:over children; div[@type='cleared'] -> ge:used + anchor;
# strip text-structure tags and Schroer @n numbers; wrap line-like
# elements as ge:line (with tei:hi for underlined rends); del -> f:st;
# fix/repetition -> ge:rewrite; umlaut corr/reg -> orig; join
# @rend='inline' lines; inline lb -> ge:line; move notes into zones;
# expand "u" abbreviations; ex outside abbr -> supplied; delSpan ->
# f:st; detach marginal and interlinear material into zones/inter
# lines with f:ins/anchor links; strip remaining add; drop zone-level
# lb; normalize @type values and "#_"->"#sc" references; then strip
# hand markup from the textual transcript and write the converted file
# under conversion_test/ with a "conv_" prefix.
def convert(): tei_text_xp = faust.xpath("//tei:text") xml_id_cnt = 0 for xml_file in [faust.absolute_path(rel) for rel in static_to_convert()]: print xml_file xml = lxml.etree.parse(xml_file) # ***** Documentary transcript ***** # prepare <ge:document/> context root = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces) root.set(faust.ns("xml") + "id", "converted") for child in xml.getroot(): if child.tag not in (faust.ns("tei") + "teiHeader", faust.ns("tei") + "facsimile"): child.addprevious(root) break surface = lxml.etree.SubElement(root, faust.ns("tei") + "surface") for body in faust.xpath(".//tei:body", xml): body_zone = copy.deepcopy(body) body_zone.tag = faust.ns("tei") + "zone" surface.append(body_zone) root = surface # let <add/>/<del/> inherit @hand from <subst/>/<restore/> for container_with_hand in faust.xpath(".//tei:subst[@hand]|./tei:restore[@hand]", root): hand = container_with_hand.get("hand") for add_xml in faust.xpath("./tei:add[count(@hand) = 0]", container_with_hand): add_xml.set("hand", hand) for del_xml in faust.xpath("./tei:del[count(@hand) = 0]", container_with_hand): del_xml.set("hand", hand) del container_with_hand.attrib["hand"] # convert @hand into <handShift/> for hand_annotated in faust.xpath(".//*[@hand]", root): if hand_annotated.tag not in (faust.ns("tei") + "add", faust.ns("tei") + "fw"): continue handShifts = faust.xpath("./preceding::tei:handShift", hand_annotated) last_hand = (len(handShifts) > 0) and handShifts[-1].get("new") or "#i_have_no_idea" # start of new hand hs = lxml.etree.Element(faust.ns("tei") + "handShift") hs.set("new", hand_annotated.get("hand")) hs.tail = hand_annotated.text hand_annotated.text = None hand_annotated.insert(0, hs) # reset to last hand hs = lxml.etree.Element(faust.ns("tei") + "handShift") hs.set("new", last_hand) hand_annotated.append(hs) del hand_annotated.attrib["hand"] # convert <div/> with @type == "zone" for div in root.iter(faust.ns("tei") + "div"): if "zone" == 
div.get("type", ""): div.tag = faust.ns("tei") + "zone" del div.attrib["type"] # convert overwritten parts for subst in root.iter(faust.ns("tei") + "subst"): att_vals = ["overwrite", "overwriting", "overwritiung"] def type_overwr_in_attributes(element): return element.get("type", "") in att_vals def rend_overwr_in_attributes(element): return element.get("rend", "") in att_vals children_with_type = filter(type_overwr_in_attributes, subst) children_with_rend = filter(rend_overwr_in_attributes, subst) # type attribute in substitution if type_overwr_in_attributes(subst): del subst.attrib["type"] # rend attribute in substitution elif rend_overwr_in_attributes(subst): del subst.attrib["rend"] # type attribute in a child (in add or del) elif children_with_type: for child in children_with_type: del child.attrib["type"] # rend attribute in a child elif children_with_rend: for child in children_with_rend: del child.attrib["rend"] else: continue subst.tag = faust.ns("f") + "overw" for del_xml in subst.findall(faust.ns("tei") + "del"): del_xml.tag = faust.ns("f") + "under" for add in subst.findall(faust.ns("tei") + "add"): add.tag = faust.ns("f") + "over" # <div type="cleared"/> becomes <ge:used spanTo="#..."> for div in root.iter(faust.ns("tei") + "div"): if "type" in div.attrib: if div.attrib["type"] == "cleared": used = lxml.etree.Element(faust.ns("ge") + "used") div.addprevious(used) xml_id_cnt += 1 anchor_id = str(xml_id_cnt) used.set("spanTo", "#" + anchor_id) for child in div.getchildren(): div.remove(child) div.addprevious(child) div.tag = faust.ns("tei") + "anchor" div.set(faust.ns("xml") + "id", anchor_id) # throw away text structure tagging lxml.etree.strip_tags(root,\
faust.ns("tei") + "div", faust.ns("tei") + "lg",\
faust.ns("tei") + "sp", faust.ns("tei") + "subst",\
faust.ns("tei") + "name", faust.ns("tei") + "addSpan") # remove Schroer numbers for l in root.iter(faust.ns("tei") + "l"): if "n" in l.attrib: del l.attrib["n"] # create simple lines for line_element 
in ("speaker", "l", "p", "stage", "head", "ab"): line_elements = list(root.iter(faust.ns("tei") + line_element)) for le in line_elements: if le.get("rend", "") in ["underline", "underlined", "centered unterline"]: hi = copy.deepcopy(le) hi.tag = faust.ns("tei") + "hi" le.clear() for attr in list(hi.attrib.keys()): if attr == "rend": continue le.set(attr, hi.get(attr)) del hi.attrib[attr] le.append(hi) le.tag = faust.ns("ge") + "line" # turn deletions into <f:st/> by default for del_xml in root.iter(faust.ns("tei") + "del"): del_xml.tag = faust.ns("f") + "st" del_type = del_xml.get("rend", "") if del_type == "strikethrough" or del_type == "strikedthrough": del del_xml.attrib["rend"] # rename tags for fixations for rewrite_tag in ("fix", "repetition"): for rewrite in root.iter(faust.ns("tei") + rewrite_tag): rewrite.tag = faust.ns("ge") + "rewrite" # rename semantic tags with @rend="underline" for sem_hi_tag in ("emph", "name"): for sem_hi in root.iter(faust.ns("tei") + sem_hi_tag): if sem_hi.get("rend", "") == "underline": sem_hi.tag = faust.ns("tei") + "hi" # convert umlaut corrections umlaut_mapping = { u"ä":u"a", u"Ä":u"A", u"ö":u"o", u"Ö":u"O", u"ü":u"u", u"Ü":u"U" } corr_or_reg = itertools.chain(root.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg")) for element in corr_or_reg: for umlaut in umlaut_mapping: if element.text == umlaut: element.text = umlaut_mapping[umlaut] element.tag = faust.ns("tei") + "orig" # join lines with @rend='inline' for inline_line in list(faust.xpath(".//ge:line[@rend='inline']", root)): prev_lines = faust.xpath("./preceding::ge:line", inline_line) if len(prev_lines) == 0: continue prev_line = prev_lines[-1] if inline_line.text is None: inline_line.text = " " else: inline_line.text += " " inline_line.getparent().remove(inline_line) prev_line.append(inline_line) lxml.etree.strip_tags(prev_line, faust.ns("ge") + "line") # convert inline <lb/> to <ge:line/> for lb in list(root.iter(faust.ns("tei") + "lb")): parent = 
lb.getparent() if parent.tag != (faust.ns("ge") + "line"): continue lb.tag = faust.ns("ge") + "line" lb.text = lb.tail lb.tail = None sibling = lb.getnext() while sibling is not None: next_sibling = sibling.getnext() parent.remove(sibling) lb.append(sibling) sibling = next_sibling parent.remove(lb) parent.addnext(lb) # put <note/> in zones for note in list(root.iter(faust.ns("tei") + "note")): parent = surface if len(faust.xpath(".//ge:line", note)) == 0: parent = lxml.etree.SubElement(parent, faust.ns("tei") + "zone") note.tag = faust.ns("ge") + "line" else: note.tag = faust.ns("tei") + "zone" note.getparent().remove(note) parent.append(note) if "place" in note.attrib: del note.attrib["place"] # u<ex>nd</ex> becomes "und" for ex in root.iter(faust.ns("tei") + "ex"): try: pre_sibling = ex.itersiblings(preceding=True).next() except StopIteration: continue if pre_sibling.text: if re.split("\s+", pre_sibling.text).pop() == "u": # only in two files, do nothing pass # <abbr>u</abbr> becomes "und" for abbr in root.iter(faust.ns("tei") + "abbr"): if abbr.text == "u": tail = abbr.tail if tail: abbr.tail = "und" + tail else: abbr.tail = "und" remove_keep_tail(abbr) #abbr.tag = None # </ex> outside of <abbr/> becomes <supplied/> for ex in root.iter(faust.ns("tei") + "ex"): pass if not list(ex.iterancestors(faust.ns("tei") + "abbr")): ex.tag = faust.ns("tei") + "supplied" # <delSpan/> becomes <f:st/> for delSpan in root.iter(faust.ns("tei") + "delSpan"): delSpan.tag = faust.ns("f") + "st" # detach marginal elements for margin in list(faust.xpath(".//*[@place]", root)): place = margin.get("place") if place not in ("margin",\
"top", "top-left", "topleft", "top-right", "topright",\
"bottom", "bottom-left", "bottomleft", "bottom-right", "bottomright"): continue del margin.attrib["place"] parent = margin.getparent() margin_zone = lxml.etree.Element(faust.ns("tei") + "zone") if place.startswith("top"): surface.insert(0, margin_zone) else: surface.append(margin_zone) margin_parent = 
margin_zone if margin.tag != faust.ns("ge") + "line": margin_parent = lxml.etree.SubElement(margin_parent, faust.ns("ge") + "line") for ancestor in margin.iterancestors(faust.ns("ge") + "line"): line_id = ancestor.get(faust.ns("xml") + "id", None) if line_id is None: xml_id_cnt += 1 line_id = "line_" + str(xml_id_cnt) ancestor.set(faust.ns("xml") + "id", line_id) margin_zone.set(faust.ns("f") + "top", "#" + line_id) break parent.remove(margin) margin_parent.append(margin) # detach interlinear additions for inter_add in list(faust.xpath(".//tei:add[@place='above' or @place='below']", root)): line = None for ancestor in inter_add.iterancestors(faust.ns("ge") + "line"): line = ancestor break if line is None: raise Exception(lxml.etree.tostring(inter_add)) adjunct_line = None if inter_add.get("place") == "above": adjunct_line = line.getprevious() else: adjunct_line = line.getnext() if (adjunct_line is None) or (adjunct_line.tag != (faust.ns("ge") + "line")) or\
(adjunct_line.get("type", "") != "inter"): adjunct_line = lxml.etree.Element(faust.ns("ge") + "line") adjunct_line.set("type", "inter") if inter_add.get("place") == "above": line.addprevious(adjunct_line) else: line.addnext(adjunct_line) xml_id_cnt += 1 anchor_id = "anchor_" + str(xml_id_cnt) ins_mark = lxml.etree.SubElement(adjunct_line, faust.ns("f") + "ins") ins_mark.set(faust.ns("f") + "at", "#" + anchor_id) ins_mark.tail = inter_add.text inter_add.text = None inter_add.tag = faust.ns("tei") + "anchor" inter_add.set(faust.ns("xml") + "id", anchor_id) for child in inter_add.getchildren(): inter_add.remove(child) adjunct_line.append(child) del inter_add.attrib["place"] # remove remaining <add/> elements lxml.etree.strip_tags(root, faust.ns("tei") + "add") # remove <lb/>s, which are located in zones after conversion for lb in list(root.iter(faust.ns("tei") + "lb")): parent = lb.getparent() if parent.tag == (faust.ns("tei") + "zone"): parent.remove(lb) # convert some attribute values for typed in 
faust.xpath(".//*[@type='foliation']" , root): typed.set("type", "folioNum") for typed in faust.xpath(".//*[@type='sigel']" , root): typed.set("type", "sig") for typed in faust.xpath(".//*[@type='sigil']" , root): typed.set("type", "sig") # "#_bl", "#_t" u.ä. → "#sc_bl" etc. for any_elem in root.iter(tag=lxml.etree.Element): for attrib in any_elem.attrib: val = any_elem.get(attrib) if val.startswith("#_"): new_val ="#sc" + val[1:] any_elem.set(attrib, new_val) #remove type attributes for certain values for typed in faust.xpath(".//*[@type='instant' or @type='inst' or @type='instantrevision'\
 or @type='late' or @type='soon']" , root): del typed.attrib["type"] # ***** Textual transcript ***** for text in faust.xpath(".//tei:text", xml): # remove hand attribute for hand_attributed in faust.xpath(".//*[@hand]", text): del hand_attributed.attrib["hand"] #remove handShifts lxml.etree.strip_tags(text, faust.ns("tei") + "handShift") # convert umlaut corrections corr_or_reg = itertools.chain(text.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg")) for element in corr_or_reg: for umlaut in umlaut_mapping: if element.text == umlaut: element.text = umlaut_mapping[umlaut] element.tag = faust.ns("tei") + "orig" # write the converted file path = ("conversion_test/" + faust.relative_path(xml_file)).split("/") path[-1] = "conv_" + path[-1] dir_path = "/".join(path[:-1]) if not os.path.isdir(dir_path): os.makedirs(dir_path) xml.write("/".join(path), encoding="UTF-8")
#!/usr/bin/env python
#
# For each documentary file that has a tei:text but no ge:document,
# insert an empty documentary-transcript skeleton (ge:document >
# tei:surface > tei:zone[@type='main']) before tei:text and write the
# result under /tmp/faust/.

import faust
import query
import lxml.etree
import os.path

text_xp = faust.xpath("//tei:text")

for f in query.matches(query.documentary_by_name(),
                       "//tei:text and not(//ge:document)"):
    relpath = faust.relative_path(f)
    xml = lxml.etree.parse(f)
    text = text_xp(xml)[0]

    # build the empty documentary-transcript skeleton
    gedocument = lxml.etree.Element(faust.ns("ge") + "document",
                                    nsmap=faust.namespaces)
    surface = lxml.etree.Element(faust.ns("tei") + "surface")
    gedocument.append(surface)
    zone = lxml.etree.Element(faust.ns("tei") + "zone")
    zone.set("type", "main")
    surface.append(zone)
    text.addprevious(gedocument)

    out = os.path.join("/tmp/faust/" + relpath)
    outdir = os.path.dirname(out)
    try:
        os.makedirs(outdir)
    except OSError:
        # the directory may already exist; the original bare
        # "except: pass" also swallowed KeyboardInterrupt/SystemExit
        pass
    xml.write(out, encoding="UTF-8")
# NOTE(review): collapsed + truncated fragment (compare_streams is cut
# off after its first statement).  Left byte-identical.  Visible
# behavior: prepare XPaths for tei:text / ge:document / tei:body and
# auto-generated template divs; replace_new=False gives a dry run.
# # Compare two directories of TEI files, ignoring whitespace. Replace # tei:text and ge:document elements if nothing significant has changed from __future__ import print_function import io import string import os import sys import lxml.etree import faust import copy import rev_desc txt_xp = faust.xpath("//tei:TEI/tei:text") doc_xp = faust.xpath("//tei:TEI/ge:document") body_xp = faust.xpath("//tei:TEI/tei:text/tei:body") # automatically generated templates template_xp = faust.xpath("//tei:TEI/tei:text/tei:body/tei:div[@type='template']") # False for a dry run replace_new = True def compare_streams(one, two): """Decides if two streams are equal, ignoring whitespace""" char_1 = " "
# NOTE(review): collapsed + truncated fragment (the per-file loop body
# is entirely missing).  Left byte-identical.  Visible behavior:
# extract handNotes and charDecl fragments from the template TEI file;
# replace() swaps a node for a deep copy of a node from another
# document.
#!/usr/bin/env python # # Updates the all TEI headers based on a template # import copy import sys import lxml.etree import faust # XPath expressions for extraction of templated header contents handNotes_xp = faust.xpath("//tei:teiHeader/tei:profileDesc/tei:handNotes") charDecl_xp = faust.xpath("//tei:teiHeader/tei:encodingDesc/tei:charDecl") # Get the template and parse it tei_template = faust.absolute_path("template/tei.xml") template = lxml.etree.parse(tei_template) # extract relevant header fragments from template template_hand_notes = handNotes_xp(template)[0] template_char_decl = charDecl_xp(template)[0] def replace(node, with_node): '''Replaces a node with a deep copy of a node (from another document)''' node.getparent().replace(node, copy.deepcopy(with_node)) # iterate over TEI files (excluding the template) for xml_file in faust.xml_files():
# NOTE(review): collapsed + truncated duplicate of the add_change module
# (cut off right after the change element is created; change.text and
# the append are outside this view).  Left byte-identical.  Same
# default-argument pitfall as the other copy: when=...isoformat() is
# evaluated once at import time, not per call.
#!/usr/bin/env python # coding=UTF-8 # # Add a revision description change to an xml file represented by an ElementTree import lxml.etree import faust import datetime revdesc_xp = faust.xpath("//tei:teiHeader/tei:revisionDesc") header_xp = faust.xpath("//tei:teiHeader") def add_change (xml, who, content, when = datetime.date.today().isoformat()): '''Adds a change element to the revisionDesc in the TEI header''' header = header_xp(xml) if not header: raise ValueError("No TEI header present") # if there is no tei:revisionDesc element, insert one if not revdesc_xp(xml): rev_desc_element = lxml.etree.Element(faust.ns("tei") + "revisionDesc") # revisionDesc always goes to the end of the header header[0].append(rev_desc_element) # build change element attribs = {"when" : when, "who" : who } change = lxml.etree.Element(faust.ns("tei") + "change", attribs)
# NOTE(review): collapsed + truncated variant of the GSA metadata chunk:
# it starts mid-script ("last" is used but initialized outside this
# view) and ends before the final write.  Left byte-identical.
pages.sort() for p in pages: p_xml = lxml.etree.Element(faust_ns + "materialUnit") p_xml.set("type", "page") p_xml.set("transcript", p) if last is None: document_xml.insert(0, p_xml) else: last.addnext(p_xml) last = p_xml metadata_xml = lxml.etree.Element(faust_ns + "metadata") document_xml.insert(0, metadata_xml) lxml.etree.SubElement(metadata_xml, faust_ns + "archive").text = "gsa" callnumber = faust.xpath("//f:signatur/text()", document_xml)[0] if callnumber in gsa_callnumber_mapping: callnumber = gsa_callnumber_mapping[ callnumber] + " (" + callnumber + ")" lxml.etree.SubElement(metadata_xml, faust_ns + "callnumber").text = callnumber wa_id_matches = faust.xpath("//f:key[@n='25']/following::f:value", document_xml) if (len(wa_id_matches) > 0): wa_id = wa_id_matches[0].text if wa_id != "-" and wa_id != "oS": lxml.etree.SubElement(metadata_xml, faust_ns + "waId").text = wa_id xml_dir = faust.absolute_path("/".join(("document", ) + documents_struct[gsa_ident][0]))
# NOTE(review): collapsed + truncated duplicate of the status-count
# module (cut off at a dangling "try:").  Left byte-identical.
#!/usr/bin/env python # # Report on the transcript status as specified under # # https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription # import sys import lxml.etree import faust # XPath expression for extracting the revision history from TEI documents change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change") def count(): # status counters status_keys = [key for (key, value) in faust.config.items("log-status")] status_dict = {} for key in status_keys: status_dict[key] = 0 status_keys.sort() status_unknown = 0 # iterate over all TEI documents for xml_file in faust.transcript_files(): status = set() try:
# NOTE(review): collapsed + truncated duplicate of the ge:document
# insertion script, cut off at a dangling bare "except:".  Left
# byte-identical.  When the full file is edited, narrow that bare
# except (it swallows KeyboardInterrupt/SystemExit) to OSError.
#!/usr/bin/env python import faust import query import lxml.etree import os.path text_xp = faust.xpath("//tei:text") for f in query.matches(query.documentary_by_name(), "//tei:text and not(//ge:document)"): relpath = faust.relative_path(f) xml = lxml.etree.parse(f) text = text_xp(xml)[0] gedocument = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces) surface = lxml.etree.Element(faust.ns("tei") + "surface") gedocument.append(surface) zone = lxml.etree.Element(faust.ns("tei") + "zone") zone.set("type", "main") surface.append(zone) text.addprevious(gedocument) out = os.path.join("/tmp/faust/" + relpath) outdir = os.path.dirname(out) try: os.makedirs(outdir) except:
# NOTE(review): collapsed + truncated duplicate of the TEI-directory
# comparison module (compare_streams cut off after its first
# statement).  Left byte-identical.
# # Compare two directories of TEI files, ignoring whitespace. Replace # tei:text and ge:document elements if nothing significant has changed from __future__ import print_function import io import string import os import sys import lxml.etree import faust import copy import rev_desc txt_xp = faust.xpath("//tei:TEI/tei:text") doc_xp = faust.xpath("//tei:TEI/ge:document") body_xp = faust.xpath("//tei:TEI/tei:text/tei:body") #automatically generated templates template_xp = faust.xpath("//tei:TEI/tei:text/tei:body/tei:div[@type='template']") # False for a dry run replace_new = True def compare_streams(one, two): '''Decides if two streams are equal, ignoring whitespace''' char_1 = ' '