def tei_transform(tei_file, transform_etree):
    """Apply a transformation to a TEI file and write the result back in place.

    The file is parsed with lxml, the parsed tree is handed to
    ``transform_etree``, and whatever that callable returns is passed through
    ``faust.tei_serialize`` and written over ``tei_file`` as UTF-8.

    tei_file        -- path of the document to transform (also the output path)
    transform_etree -- callable taking the parsed tree and returning the tree
                       to be serialized

    Nothing is returned. Files rejected by ``faust.is_tei_document`` are
    reported on stderr and left untouched; I/O and XML syntax errors are
    reported on stderr instead of being propagated.
    """
    try:
        if not faust.is_tei_document(tei_file):
            # Report the offending path. The previous code concatenated the
            # builtin ``file`` here, which raised instead of printing.
            sys.stderr.write("Not a TEI file: " + tei_file + "\n")
            return

        xml = lxml.etree.parse(tei_file)
        result = transform_etree(xml)
        faust.tei_serialize(result).write(tei_file, encoding="UTF-8")
    except IOError:
        sys.stderr.write("I/O error while transforming " + tei_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error while transforming " + tei_file + "\n")
def to_convert():
    """Collect the transcript files that still have to be converted.

    A file from ``faust.xml_files()`` is selected when all of these hold:

    * its relative path starts with the ``transcript`` directory,
    * its name (minus ``.xml``) differs from the name of its directory,
    * the first run of digits in its name is not the number 1,
    * ``faust.is_tei_document`` accepts it,
    * ``//tei:text`` contains text while ``//ge:document`` contains none.

    Returns the list of selected files, in iteration order.

    NOTE: a file name without any digit makes ``re.search`` return None and
    the ``.group(0)`` call raise AttributeError; parse errors from lxml are
    not caught here either.
    """
    tei_text_xp = faust.xpath("normalize-space(//tei:text)")
    ge_document_xp = faust.xpath("normalize-space(//ge:document)")

    def contains_text(xp, tree):
        # True when the XPath result is non-empty once whitespace is stripped.
        joined = " ".join(xp(tree))
        return len(joined.strip()) > 0

    pending = []
    for xml_file in faust.xml_files():
        segments = faust.relative_path(xml_file).split("/")
        if segments[0] != "transcript":
            continue

        name = segments[-1]
        parent_dir = segments[-2]
        # skip the file that is named after the directory containing it
        if name[:-len(".xml")] == parent_dir:
            continue

        # skip the file whose first number is 1
        first_number = re.search(r'[0-9]+', name).group(0)
        if int(first_number) == 1:
            continue

        if not faust.is_tei_document(xml_file):
            continue

        tree = lxml.etree.parse(xml_file)
        if not contains_text(tei_text_xp, tree):
            continue
        if contains_text(ge_document_xp, tree):
            continue
        pending.append(xml_file)

    return pending
def count():
    """Tally, per status keyword, how many transcript files mention it.

    The keywords are the keys of the ``log-status`` section of
    ``faust.config``. For every file from ``faust.transcript_files()`` that
    ``faust.is_tei_document`` accepts, each change record selected by
    ``change_xp`` is serialized, lower-cased and searched for every keyword.
    A file is counted once for each distinct keyword found in any of its
    change records, so one file may contribute to several counters.

    Returns a tuple ``(status_dict, status_unknown)``: the per-keyword
    counters and the number of files for which no keyword was found. Files
    that are not TEI documents, or that fail with an I/O or XML syntax error
    (reported on stderr), end up in the unknown count as well.
    """
    # status counters, all starting at zero
    status_keys = sorted(key for (key, value) in faust.config.items("log-status"))
    status_dict = dict.fromkeys(status_keys, 0)
    status_unknown = 0

    # the keywords as they are searched for in the change records
    candidates = [key.strip() for key in status_keys]

    # iterate over all transcript documents
    for xml_file in faust.transcript_files():
        found = set()
        try:
            if faust.is_tei_document(xml_file):
                xml = lxml.etree.parse(xml_file)
                # scan every change record for status remarks
                for change in change_xp(xml):
                    change_str = lxml.etree.tostring(change).lower().strip()
                    found.update(c for c in candidates if c in change_str)
        except IOError:
            sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
        except lxml.etree.XMLSyntaxError:
            sys.stderr.write("XML error while extracting status from " + xml_file + "\n")

        if not found:
            # no status given
            status_unknown += 1
            continue

        # increment every status entry mentioned in this file
        for name in found:
            status_dict[name] += 1

    return status_dict, status_unknown
ignored_tags = ( "app", "back", "body", "choice", "div", "docTitle", "fix", "front", "fw", "g", "group", "lg", "overw", "patch", "sp", "subst", "surface", "text", "titlePage", "titlePart", "used", "zone") ignored_empty_elems = ( "addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", "ins", "join", "lb", "pb", "space", "st", "undo", "p") element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]") text_content_xp = faust.xpath("normalize-space()") candidates = dict() for xml_file in faust.xml_files(): try: if faust.is_tei_document(xml_file): xml = lxml.etree.parse(xml_file) xml_key = faust.relative_path(xml_file) candidates[xml_key] = [] for elem in element_selector_xp(xml): if elem.tag.startswith(faust.ns("svg")): continue local_name = elem.tag[elem.tag.rfind("}") + 1:] if local_name in ignored_tags: continue empty_elem = elem.text is None and len(elem) == 0 if empty_elem and local_name in ignored_empty_elems: continue text_content = text_content_xp(elem)
import sys

import lxml.etree

import faust

# Lists, one relative path per line on stdout, every TEI document that has
# ge:document content but no change record mentioning "encoded".

# XPath expression for extracting the whitespace-normalized content of ge:document
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents
for xml_file in faust.xml_files():
    try:
        if not faust.is_tei_document(xml_file):
            continue

        xml = lxml.etree.parse(xml_file)

        # documents without any ge:document content are not reported
        if len(ge_doc_xp(xml).strip()) == 0:
            continue

        # a document counts as encoded as soon as one change record says so
        encoded = any(
            "encoded" in lxml.etree.tostring(change).lower()
            for change in change_xp(xml)
        )

        if not encoded:
            # single-argument call form: prints the same text under the
            # Python 2 print statement and the Python 3 print function
            print(faust.relative_path(xml_file))
    except IOError:
        sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
    except lxml.etree.XMLSyntaxError:
        # keep scanning the remaining files instead of aborting on one
        # malformed document
        sys.stderr.write("XML error while extracting status from " + xml_file + "\n")
# Copies the hand-notes and character-declaration header fragments of the
# TEI template into every other TEI document, rewriting each file in place.

# Locate and parse the template
tei_template = faust.absolute_path("template/tei.xml")
template = lxml.etree.parse(tei_template)

# The first match of each selector in the template is the fragment to copy
template_hand_notes = handNotes_xp(template)[0]
template_char_decl = charDecl_xp(template)[0]


def replace(node, with_node):
    '''Swaps node for a deep copy of with_node (which may belong to another document)'''
    parent = node.getparent()
    clone = copy.deepcopy(with_node)
    parent.replace(node, clone)


# Walk over all XML files; the template itself and non-TEI files are skipped
for xml_file in faust.xml_files():
    try:
        if xml_file == tei_template or not faust.is_tei_document(xml_file):
            continue

        xml = lxml.etree.parse(xml_file)

        # swap in the template fragments: hand notes first, then the
        # character declarations
        for selector, fragment in ((handNotes_xp, template_hand_notes),
                                   (charDecl_xp, template_char_decl)):
            for stale in selector(xml):
                replace(stale, fragment)

        # store the updated document under its original path
        faust.tei_serialize(xml).write(xml_file, encoding="UTF-8")
    except IOError:
        sys.stderr.write("I/O error while updating " + xml_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML syntax error while updating " + xml_file + "\n")