Exemple #1
0
def to_convert():
	'''Return the transcript files that still have to be converted.

	A file from faust.xml_files() is selected when all of the following hold:
	its relative path starts with "transcript"; its name without ".xml"
	differs from the name of its parent directory; the first number in its
	name is not 1; it is a TEI document; its tei:text has text content while
	its ge:document has none.
	'''
	tei_text_xp = faust.xpath("normalize-space(//tei:text)")
	ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

	def has_text_in(xp, xml):
		# True if the XPath result contains anything but whitespace
		joined = " ".join(xp(xml))
		return len(joined.strip()) > 0

	pending = []
	for xml_file in faust.xml_files():
		segments = faust.relative_path(xml_file).split("/")
		if segments[0] != "transcript":
			continue
		file_name = segments[-1]
		# files named like their directory are skipped
		if file_name[:-len(".xml")] == segments[-2]:
			continue
		# files whose first number is 1 are skipped as well
		first_number = re.search(r'[0-9]+', file_name).group(0)
		if int(first_number) == 1:
			continue
		if not faust.is_tei_document(xml_file):
			continue

		xml = lxml.etree.parse(xml_file)
		if not has_text_in(tei_text_xp, xml):
			continue
		if has_text_in(ge_doc_xp, xml):
			continue
		pending.append(xml_file)
	return pending
Exemple #2
0
def unique_values(files, xpath):
	'''Collect the distinct tags of all nodes matched by an XPath.

	files -- iterable of paths of the XML files to parse
	xpath -- XPath expression, compiled via faust.xpath() for every file

	Returns a set of the `tag` values of all matches.  Files that are not
	well-formed are reported on stderr and otherwise ignored.
	'''
	seen = set()
	for path in files:
		try:
			tree = lxml.etree.parse(path)
			matches = faust.xpath(xpath)(tree)
			seen.update([node.tag for node in matches])
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML syntax error: " + path + "\n")
	return seen
Exemple #3
0
def unique_values(files, xpath):
	'''Collect the distinct results of an XPath over several files.

	files -- iterable of paths of the XML files to parse
	xpath -- XPath expression, compiled via faust.xpath() for every file

	Returns a set holding every item the expression yielded.  Files that are
	not well-formed are reported on stderr and otherwise ignored.
	'''
	seen = set()
	for path in files:
		try:
			tree = lxml.etree.parse(path)
			evaluate = faust.xpath(xpath)
			seen.update(evaluate(tree))
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML syntax error: " + path + "\n")
	return seen
Exemple #4
0
def unique_values(files, xpath):
	'''Collect the distinct values of all matches of an XPath.

	files -- iterable of paths of the XML files to parse
	xpath -- XPath expression, compiled via faust.xpath() for every file

	A match contributes its `tag` attribute if it has one, and its str()
	representation otherwise.  Files that are not well-formed are reported on
	stderr (followed by the parser message) and otherwise ignored.
	'''
	def label(match):
		try:
			name = match.tag
		except AttributeError:
			name = str(match)
		return name

	seen = set()
	for path in files:
		try:
			tree = lxml.etree.parse(path)
			matches = faust.xpath(xpath)(tree)
			seen.update([label(match) for match in matches])
		except lxml.etree.XMLSyntaxError as error:
			sys.stderr.write("XML syntax error: " + path + "\n")
			sys.stderr.write(str(error))
	return seen
Exemple #5
0
	def does_match (file):
		# Evaluates `xpath` (not a parameter, taken from the surrounding scope)
		# against the parsed file and returns the result.  A file that is not
		# well-formed is only reported on stderr.
		try:
			tree = lxml.etree.parse(file)
			evaluate = faust.xpath(xpath)
			return evaluate(tree)
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML syntax error: " + file + "\n")
		
		last = None
		pages.sort()
		for p in pages:
			p_xml = lxml.etree.Element(faust_ns + "materialUnit")
			p_xml.set("type", "page")
			p_xml.set("transcript", p)
			if last is None:
				document_xml.insert(0, p_xml)
			else:
				last.addnext(p_xml)
			last = p_xml
	metadata_xml = lxml.etree.Element(faust_ns + "metadata")
	document_xml.insert(0, metadata_xml)
	
	lxml.etree.SubElement(metadata_xml, faust_ns + "archive").text = "gsa"
	
	callnumber = faust.xpath("//f:signatur/text()", document_xml)[0]
	if callnumber in gsa_callnumber_mapping:
		callnumber = gsa_callnumber_mapping[callnumber] + " (" + callnumber + ")" 
	lxml.etree.SubElement(metadata_xml, faust_ns + "callnumber").text = callnumber
			
	wa_id_matches = faust.xpath("//f:key[@n='25']/following::f:value", document_xml)
	if (len(wa_id_matches) > 0):
		wa_id = wa_id_matches[0].text
		if wa_id != "-" and wa_id != "oS":
			lxml.etree.SubElement(metadata_xml, faust_ns + "waId").text = wa_id
		
	xml_dir = faust.absolute_path("/".join(("document", ) + documents_struct[gsa_ident][0]))
	if not os.path.isdir(xml_dir): os.makedirs(xml_dir)
	document_xml.getroottree().write("/".join((xml_dir, "gsa_" + gsa_ident + ".xml")), encoding="UTF-8", pretty_print=True)
# Search for missing @xml:space

import lxml.etree

import faust

ignored_tags = (
	"app", "back", "body", "choice", "div", "docTitle", "fix", "front", "fw", "g", 
	"group", "lg", "overw", "patch", "sp", "subst", "surface", "text", "titlePage", "titlePart", 
	"used", "zone")
	
ignored_empty_elems = (
	"addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", 
	"ins", "join", "lb", "pb", "space", "st", "undo", "p")

element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]")
text_content_xp = faust.xpath("normalize-space()")

candidates = dict()
for xml_file in faust.xml_files():
	try:
		if faust.is_tei_document(xml_file):
			xml = lxml.etree.parse(xml_file)
			
			xml_key = faust.relative_path(xml_file)
			candidates[xml_key] = []
			
			for elem in element_selector_xp(xml):
				if elem.tag.startswith(faust.ns("svg")): continue
				
				local_name = elem.tag[elem.tag.rfind("}") + 1:]
Exemple #8
0
	def does_match (file):
		'''Return the result of `xpath` for one file.

		`xpath` is not a parameter; it is taken from the surrounding scope.
		For a file that is not well-formed a message is written to stderr and
		nothing is returned (i.e. None).
		'''
		try:
			tree = lxml.etree.parse(file)
			evaluate = faust.xpath(xpath)
			return evaluate(tree)
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML syntax error: " + file + "\n")
Exemple #9
0
#!/usr/bin/env python
# coding=UTF-8
#
# Correct the links to facsimile files
#
import faust
import transform
import lxml
import os
import sys
import rev_desc

doc_template = lxml.etree.parse(
    os.path.join(faust.xml_dir, "template", "tei.xml"))
graphic_xp = faust.xpath("//tei:facsimile/tei:graphic")
header_xp = faust.xpath("/tei:TEI/tei:teiHeader")
valid_graphic_uris = faust.facsimiles()


def xml_names_from_facsimiles():
    """Map every facsimile URI to the path of its transcript XML file.

    The leading ``<faust_scheme>://facsimile/`` part of each URI returned by
    ``faust.facsimiles()`` is cut off; the remainder, with ``.xml`` appended,
    is placed below ``<faust.xml_dir>/transcript``.
    """
    skip = len(faust.faust_scheme + "://facsimile/")
    return map(
        lambda uri: os.path.join(faust.xml_dir, "transcript",
                                 uri[skip:] + ".xml"),
        faust.facsimiles())


def facs_uri_from_xml(path):
    stripped = path[len(faust.xml_dir + "/facsimile/"):-len(".xml")]
Exemple #10
0
	def facs_invalid(file):
		# True as soon as one @url of a tei:facsimile/tei:graphic in the file is
		# contained in `faust_facsimiles` (a name from the surrounding scope),
		# False otherwise.
		tree = lxml.etree.parse(file)
		graphic_urls = faust.xpath("//tei:facsimile/tei:graphic/@url")(tree)
		return any(url in faust_facsimiles for url in graphic_urls)
#!/usr/bin/env python
#
# Report on the transcript status as specified under
#
# https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription
#

import sys

import lxml.etree

import faust

# XPath expression for extracting the whitespace-normalized text content of the ge:document element
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents
for xml_file in faust.xml_files():
	status = None
	try:
		if faust.is_tei_document(xml_file):
			xml = lxml.etree.parse(xml_file)
			if len(ge_doc_xp(xml).strip()) == 0: continue
			
			encoded = False
			for change in change_xp(xml):
				change_str = lxml.etree.tostring(change).lower()
				if "encoded" in change_str: encoded = True
#!/usr/bin/env python
# coding=UTF-8
#
# Correct the links to facsimile files
#
import faust
import transform
import lxml
import os
import sys
import rev_desc

doc_template = lxml.etree.parse(os.path.join(faust.xml_dir, "template", "tei.xml"))
graphic_xp = faust.xpath ("//tei:facsimile/tei:graphic")
header_xp = faust.xpath ("/tei:TEI/tei:teiHeader")
valid_graphic_uris = faust.facsimiles()

def xml_names_from_facsimiles():
	'''Map every facsimile URI to the path of its transcript XML file.

	The leading "<faust_scheme>://facsimile/" part of each URI returned by
	faust.facsimiles() is cut off; the remainder, with ".xml" appended, is
	placed below <faust.xml_dir>/transcript.
	'''
	skip = len(faust.faust_scheme + "://facsimile/")
	return map(
		lambda uri: os.path.join(faust.xml_dir, "transcript", uri[skip:] + ".xml"),
		faust.facsimiles())

def facs_uri_from_xml(path):
	'''Derive a facsimile URI from the path of an XML file.

	As many leading characters as faust.xml_dir + "/facsimile/" is long and
	the trailing len(".xml") characters are cut off `path`; what remains is
	appended to "<faust_scheme>://facsimile".
	'''
	# NOTE(review): no "/" is added after "facsimile", so the remainder
	# presumably has to start with one -- confirm against the callers.
	start = len(faust.xml_dir + "/facsimile/")
	end = -len(".xml")
	return faust.faust_scheme + "://facsimile" + path[start:end]
	
def make_xml_templates():
	xml_templates = xml_names_from_facsimiles()
	# check if all directories exist
Exemple #13
0
# Search for missing @xml:space

import lxml.etree

import faust

ignored_tags = ("app", "back", "body", "choice", "div", "docTitle", "fix",
                "front", "fw", "g", "group", "lg", "overw", "patch", "sp",
                "subst", "surface", "text", "titlePage", "titlePart", "used",
                "zone")

ignored_empty_elems = ("addSpan", "anchor", "cb", "certainty", "damageSpan",
                       "delSpan", "gap", "grBrace", "grLine", "handShift",
                       "ins", "join", "lb", "pb", "space", "st", "undo", "p")

element_selector_xp = faust.xpath(
    "//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]")
text_content_xp = faust.xpath("normalize-space()")

candidates = dict()
for xml_file in faust.xml_files():
    try:
        if faust.is_tei_document(xml_file):
            xml = lxml.etree.parse(xml_file)

            xml_key = faust.relative_path(xml_file)
            candidates[xml_key] = []

            for elem in element_selector_xp(xml):
                if elem.tag.startswith(faust.ns("svg")): continue

                local_name = elem.tag[elem.tag.rfind("}") + 1:]
Exemple #14
0
#!/usr/bin/env python
#
# Report on the transcript status as specified under
#
# https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription
#

import sys

import lxml.etree

import faust

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")


def count():
    # status counters
    status_keys = [key for (key, value) in faust.config.items("log-status")]
    status_dict = {}
    for key in status_keys:
        status_dict[key] = 0

    status_keys.sort()

    status_unknown = 0

    # iterate over all TEI documents
    for xml_file in faust.transcript_files():
        status = set()
Exemple #15
0
#!/usr/bin/env python
# coding=UTF-8
#
# Add a revision description change to an xml file represented by an ElementTree

import lxml.etree
import faust
import datetime

revdesc_xp = faust.xpath("//tei:teiHeader/tei:revisionDesc")
header_xp = faust.xpath("//tei:teiHeader")


def add_change(xml, who, content, when=datetime.date.today().isoformat()):
    '''Adds a change element to the revisionDesc in the TEI header'''

    header = header_xp(xml)

    if not header:
        raise ValueError("No TEI header present")

    # if there is no tei:revisionDesc element, insert one
    if not revdesc_xp(xml):
        rev_desc_element = lxml.etree.Element(faust.ns("tei") + "revisionDesc")
        # revisionDesc always goes to the end of the header
        header[0].append(rev_desc_element)

    # build change element
    attribs = {"when": when, "who": who}
    change = lxml.etree.Element(faust.ns("tei") + "change", attribs)
    change.text = content
Exemple #16
0
	def matches_in_file (file):
		'''Return [file, result of `xpath` for that file].

		`xpath` is not a parameter; it is taken from the surrounding scope.
		For a file that is not well-formed a message is written to stderr and
		nothing is returned (i.e. None).
		'''
		try:
			tree = lxml.etree.parse(file)
			result = faust.xpath(xpath)(tree)
			return [file, result]
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML syntax error: " + file + "\n")
Exemple #17
0
def convert():
	'''Convert the transcripts listed by static_to_convert() to the new encoding.

	Every file is parsed and a <ge:document xml:id="converted"/> containing one
	<tei:surface/> is inserted before the first child of the root element that
	is neither <tei:teiHeader/> nor <tei:facsimile/>.  A deep copy of every
	<tei:body/> is put on that surface as a <tei:zone/> and then rewritten step
	by step (documentary transcript).  Afterwards hand information is removed
	from the <tei:text/> elements (textual transcript).

	The result is written to "conversion_test/<relative directory>/conv_<name>";
	the path of every processed file is printed.  Nothing is returned.

	The counter used for generated xml:id values is shared by all files of one
	run.
	'''
	xml_id_cnt = 0

	for xml_file in [faust.absolute_path(rel) for rel in static_to_convert()]:
		# parenthesized so that the statement is valid in Python 2 and 3
		print(xml_file)

		xml = lxml.etree.parse(xml_file)

		# ***** Documentary transcript *****

		# prepare <ge:document/> context
		root = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
		root.set(faust.ns("xml") + "id", "converted")
		for child in xml.getroot():
			if child.tag not in (faust.ns("tei") + "teiHeader", faust.ns("tei") + "facsimile"):
				child.addprevious(root)
				break

		surface = lxml.etree.SubElement(root, faust.ns("tei") + "surface")
		for body in faust.xpath(".//tei:body", xml):
			body_zone = copy.deepcopy(body)
			body_zone.tag = faust.ns("tei") + "zone"
			surface.append(body_zone)

		# from here on all documentary conversions operate on the surface
		root = surface

		# let <add/>/<del/> inherit @hand from <subst/>/<restore/>
		# (both alternatives have to search all descendants; "./tei:restore"
		# would only look at the direct children of the surface, which are zones)
		for container_with_hand in faust.xpath(".//tei:subst[@hand]|.//tei:restore[@hand]", root):
			hand = container_with_hand.get("hand")
			for add_xml in faust.xpath("./tei:add[count(@hand) = 0]", container_with_hand):
				add_xml.set("hand", hand)
			for del_xml in faust.xpath("./tei:del[count(@hand) = 0]", container_with_hand):
				del_xml.set("hand", hand)
			del container_with_hand.attrib["hand"]

		# convert @hand into <handShift/>
		for hand_annotated in faust.xpath(".//*[@hand]", root):
			if hand_annotated.tag not in (faust.ns("tei") + "add", faust.ns("tei") + "fw"): continue

			# hand that was active before this element; falls back to a
			# placeholder if there is no preceding handShift or it has no @new
			handShifts = faust.xpath("./preceding::tei:handShift", hand_annotated)
			last_hand = (handShifts[-1].get("new") if handShifts else None) or "#i_have_no_idea"

			# start of new hand
			hs = lxml.etree.Element(faust.ns("tei") + "handShift")
			hs.set("new", hand_annotated.get("hand"))
			hs.tail = hand_annotated.text

			hand_annotated.text = None
			hand_annotated.insert(0, hs)

			# reset to last hand
			hs = lxml.etree.Element(faust.ns("tei") + "handShift")
			hs.set("new", last_hand)
			hand_annotated.append(hs)

			del hand_annotated.attrib["hand"]

		# convert <div/> with @type == "zone"
		for div in root.iter(faust.ns("tei") + "div"):
			if "zone" == div.get("type", ""):
				div.tag = faust.ns("tei") + "zone"
				del div.attrib["type"]

		# convert overwritten parts
		# (the misspelled variant is one of the values that is matched)
		overwrite_values = ("overwrite", "overwriting", "overwritiung")
		for subst in root.iter(faust.ns("tei") + "subst"):
			# real lists, so that the emptiness checks below are meaningful
			children_with_type = [c for c in subst if c.get("type", "") in overwrite_values]
			children_with_rend = [c for c in subst if c.get("rend", "") in overwrite_values]
			# type attribute in substitution
			if subst.get("type", "") in overwrite_values:
				del subst.attrib["type"]
			# rend attribute in substitution
			elif subst.get("rend", "") in overwrite_values:
				del subst.attrib["rend"]
			# type attribute in a child (in add or del)
			elif children_with_type:
				for child in children_with_type:
					del child.attrib["type"]
			# rend attribute in a child
			elif children_with_rend:
				for child in children_with_rend:
					del child.attrib["rend"]
			else:
				continue
			subst.tag = faust.ns("f") + "overw"
			for del_xml in subst.findall(faust.ns("tei") + "del"):
				del_xml.tag = faust.ns("f") + "under"
			for add in subst.findall(faust.ns("tei") + "add"):
				add.tag = faust.ns("f") + "over"

		# <div type="cleared"/> becomes <ge:used spanTo="#...">
		for div in root.iter(faust.ns("tei") + "div"):
			if div.get("type") == "cleared":
				used = lxml.etree.Element(faust.ns("ge") + "used")
				div.addprevious(used)
				xml_id_cnt += 1
				anchor_id = str(xml_id_cnt)
				used.set("spanTo", "#" + anchor_id)
				# move the content in front of the div, which is left behind
				# as the anchor terminating the span
				for child in list(div):
					div.remove(child)
					div.addprevious(child)
				div.tag = faust.ns("tei") + "anchor"
				div.set(faust.ns("xml") + "id", anchor_id)

		# throw away text structure tagging
		lxml.etree.strip_tags(root,
			faust.ns("tei") + "div", faust.ns("tei") + "lg",
			faust.ns("tei") + "sp", faust.ns("tei") + "subst",
			faust.ns("tei") + "name", faust.ns("tei") + "addSpan")

		# remove Schroer numbers
		for l in root.iter(faust.ns("tei") + "l"):
			if "n" in l.attrib: del l.attrib["n"]

		# create simple lines
		for line_element in ("speaker", "l", "p", "stage", "head", "ab"):
			line_elements = list(root.iter(faust.ns("tei") + line_element))
			for le in line_elements:
				if le.get("rend", "") in ["underline", "underlined", "centered unterline"]:
					# wrap the whole content in a <hi/> that keeps @rend; all
					# other attributes stay on the line itself
					hi = copy.deepcopy(le)
					hi.tag = faust.ns("tei") + "hi"
					le.clear()
					for attr in list(hi.attrib.keys()):
						if attr == "rend": continue
						le.set(attr, hi.get(attr))
						del hi.attrib[attr]
					le.append(hi)
				le.tag = faust.ns("ge") + "line"

		# turn deletions into <f:st/> by default
		for del_xml in root.iter(faust.ns("tei") + "del"):
			del_xml.tag = faust.ns("f") + "st"
			if del_xml.get("rend", "") in ("strikethrough", "strikedthrough"):
				del del_xml.attrib["rend"]

		# rename tags for fixations
		for rewrite_tag in ("fix", "repetition"):
			for rewrite in root.iter(faust.ns("tei") + rewrite_tag):
				rewrite.tag = faust.ns("ge") + "rewrite"

		# rename semantic tags with @rend="underline"
		for sem_hi_tag in ("emph", "name"):
			for sem_hi in root.iter(faust.ns("tei") + sem_hi_tag):
				if sem_hi.get("rend", "") == "underline":
					sem_hi.tag = faust.ns("tei") + "hi"

		# convert umlaut corrections
		umlaut_mapping = {
			u"ä":u"a", u"Ä":u"A",
			u"ö":u"o", u"Ö":u"O",
			u"ü":u"u", u"Ü":u"U"
			}
		corr_or_reg = itertools.chain(root.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg"))
		for element in corr_or_reg:
			if element.text in umlaut_mapping:
				element.text = umlaut_mapping[element.text]
				element.tag = faust.ns("tei") + "orig"

		# join lines with @rend='inline'
		for inline_line in list(faust.xpath(".//ge:line[@rend='inline']", root)):
			prev_lines = faust.xpath("./preceding::ge:line", inline_line)
			if len(prev_lines) == 0: continue
			prev_line = prev_lines[-1]

			if inline_line.text is None:
				inline_line.text = " "
			else:
				inline_line.text += " "
			inline_line.getparent().remove(inline_line)
			prev_line.append(inline_line)
			lxml.etree.strip_tags(prev_line, faust.ns("ge") + "line")

		# convert inline <lb/> to <ge:line/>
		for lb in list(root.iter(faust.ns("tei") + "lb")):
			parent = lb.getparent()
			if parent.tag != (faust.ns("ge") + "line"): continue

			lb.tag = faust.ns("ge") + "line"
			lb.text = lb.tail
			lb.tail = None
			# everything following the break inside the old line moves into
			# the new line, which is then placed after the old one
			sibling = lb.getnext()
			while sibling is not None:
				next_sibling = sibling.getnext()
				parent.remove(sibling)
				lb.append(sibling)
				sibling = next_sibling
			parent.remove(lb)
			parent.addnext(lb)

		# put <note/> in zones
		for note in list(root.iter(faust.ns("tei") + "note")):
			parent = surface
			if len(faust.xpath(".//ge:line", note)) == 0:
				parent = lxml.etree.SubElement(parent, faust.ns("tei") + "zone")
				note.tag = faust.ns("ge") + "line"
			else:
				note.tag = faust.ns("tei") + "zone"
			note.getparent().remove(note)
			parent.append(note)
			if "place" in note.attrib: del note.attrib["place"]

		# u<ex>nd</ex> becomes "und"
		# (deliberately without effect, see the comment in the innermost branch)
		for ex in root.iter(faust.ns("tei") + "ex"):
			try: pre_sibling = next(ex.itersiblings(preceding=True))
			except StopIteration: continue
			if pre_sibling.text:
				if re.split(r"\s+", pre_sibling.text).pop() == "u":
					# only in two files, do nothing
					pass

		# <abbr>u</abbr> becomes "und"
		# (snapshot of the matches, because elements are removed in the loop)
		for abbr in list(root.iter(faust.ns("tei") + "abbr")):
			if abbr.text == "u":
				tail = abbr.tail
				if tail: abbr.tail = "und" + tail
				else: abbr.tail = "und"
				remove_keep_tail(abbr)

		# </ex> outside of <abbr/> becomes <supplied/>
		for ex in root.iter(faust.ns("tei") + "ex"):
			if not list(ex.iterancestors(faust.ns("tei") + "abbr")):
				ex.tag = faust.ns("tei") + "supplied"

		# <delSpan/> becomes <f:st/>
		for delSpan in root.iter(faust.ns("tei") + "delSpan"):
			delSpan.tag = faust.ns("f") + "st"

		# detach marginal elements
		marginal_places = ("margin",
			"top", "top-left", "topleft", "top-right", "topright",
			"bottom", "bottom-left", "bottomleft", "bottom-right", "bottomright")
		for margin in list(faust.xpath(".//*[@place]", root)):
			place = margin.get("place")
			if place not in marginal_places:
				continue

			del margin.attrib["place"]
			parent = margin.getparent()

			# zones for the top margin go first, all others last
			margin_zone = lxml.etree.Element(faust.ns("tei") + "zone")
			if place.startswith("top"):
				surface.insert(0, margin_zone)
			else:
				surface.append(margin_zone)

			margin_parent = margin_zone
			if margin.tag != faust.ns("ge") + "line":
				margin_parent = lxml.etree.SubElement(margin_parent, faust.ns("ge") + "line")

			# link the zone to the closest enclosing line, giving that line
			# an xml:id if it does not have one yet
			for ancestor in margin.iterancestors(faust.ns("ge") + "line"):
				line_id = ancestor.get(faust.ns("xml") + "id", None)
				if line_id is None:
					xml_id_cnt += 1
					line_id = "line_" + str(xml_id_cnt)
					ancestor.set(faust.ns("xml") + "id", line_id)
				margin_zone.set(faust.ns("f") + "top", "#" + line_id)
				break

			parent.remove(margin)
			margin_parent.append(margin)

		# detach interlinear additions
		for inter_add in list(faust.xpath(".//tei:add[@place='above' or @place='below']", root)):
			line = None
			for ancestor in inter_add.iterancestors(faust.ns("ge") + "line"):
				line = ancestor
				break
			if line is None: raise Exception(lxml.etree.tostring(inter_add))

			# reuse a neighbouring interlinear line or create a new one
			adjunct_line = None
			if inter_add.get("place") == "above":
				adjunct_line = line.getprevious()
			else:
				adjunct_line = line.getnext()
			if (adjunct_line is None) or (adjunct_line.tag != (faust.ns("ge") + "line")) or\
				(adjunct_line.get("type", "") != "inter"):
				adjunct_line = lxml.etree.Element(faust.ns("ge") + "line")
				adjunct_line.set("type", "inter")
				if inter_add.get("place") == "above":
					line.addprevious(adjunct_line)
				else:
					line.addnext(adjunct_line)

			xml_id_cnt += 1
			anchor_id = "anchor_" + str(xml_id_cnt)

			ins_mark = lxml.etree.SubElement(adjunct_line, faust.ns("f") + "ins")
			ins_mark.set(faust.ns("f") + "at", "#" + anchor_id)

			# the addition itself stays behind as the anchor; its content
			# moves to the interlinear line
			ins_mark.tail = inter_add.text
			inter_add.text = None
			inter_add.tag = faust.ns("tei") + "anchor"
			inter_add.set(faust.ns("xml") + "id", anchor_id)
			for child in list(inter_add):
				inter_add.remove(child)
				adjunct_line.append(child)
			del inter_add.attrib["place"]

		# remove remaining <add/> elements
		lxml.etree.strip_tags(root, faust.ns("tei") + "add")

		# remove <lb/>s, which are located in zones after conversion
		for lb in list(root.iter(faust.ns("tei") + "lb")):
			parent = lb.getparent()
			if parent.tag == (faust.ns("tei") + "zone"):
				parent.remove(lb)

		# convert some attribute values
		for typed in faust.xpath(".//*[@type='foliation']" , root):
			typed.set("type", "folioNum")
		for typed in faust.xpath(".//*[@type='sigel']" , root):
			typed.set("type", "sig")
		for typed in faust.xpath(".//*[@type='sigil']" , root):
			typed.set("type", "sig")

		# "#_bl", "#_t" and the like become "#sc_bl" etc.
		for any_elem in root.iter(tag=lxml.etree.Element):
			for attrib, val in list(any_elem.attrib.items()):
				if val.startswith("#_"):
					any_elem.set(attrib, "#sc" + val[1:])

		# remove type attributes for certain values
		for typed in faust.xpath(".//*[@type='instant' or @type='inst' or @type='instantrevision'"
			" or @type='late' or @type='soon']" , root):
			del typed.attrib["type"]

		# ***** Textual transcript *****

		for text in faust.xpath(".//tei:text", xml):

			# remove hand attribute
			for hand_attributed in faust.xpath(".//*[@hand]", text):
				del hand_attributed.attrib["hand"]
			# remove handShifts
			lxml.etree.strip_tags(text, faust.ns("tei") + "handShift")

			# convert umlaut corrections
			# (both iterators have to walk the textual transcript; the
			# documentary surface has been handled above)
			corr_or_reg = itertools.chain(text.iter(faust.ns("tei") + "corr"), text.iter(faust.ns("tei") + "reg"))
			for element in corr_or_reg:
				if element.text in umlaut_mapping:
					element.text = umlaut_mapping[element.text]
					element.tag = faust.ns("tei") + "orig"

		# write the converted file
		path = ("conversion_test/" + faust.relative_path(xml_file)).split("/")
		path[-1] = "conv_" + path[-1]
		dir_path = "/".join(path[:-1])
		if not os.path.isdir(dir_path): os.makedirs(dir_path)

		xml.write("/".join(path), encoding="UTF-8")
#!/usr/bin/env python

import faust
import query
import lxml.etree
import os.path

# XPath selecting every tei:text element of a parsed document
text_xp = faust.xpath("//tei:text")

# Every file yielded by query.matches() for the expression below gets an empty
# documentary skeleton (ge:document > tei:surface > tei:zone[@type='main'])
# inserted directly before its first tei:text element.
for f in query.matches (query.documentary_by_name(), "//tei:text and not(//ge:document)"):
	relpath = faust.relative_path(f)

	xml = lxml.etree.parse(f)
	text = text_xp(xml)[0]

	gedocument = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
	surface = lxml.etree.SubElement(gedocument, faust.ns("tei") + "surface")
	zone = lxml.etree.SubElement(surface, faust.ns("tei") + "zone")
	zone.set("type", "main")

	text.addprevious(gedocument)

	# the modified document is written below /tmp/faust/, mirroring the
	# relative path of the source file
	out = "/tmp/faust/" + relpath
	outdir = os.path.dirname(out)
	try:
		os.makedirs(outdir)
	except OSError:
		# an already existing directory is fine; any other failure
		# (permissions, a regular file in the way, ...) must not be hidden
		if not os.path.isdir(outdir):
			raise
	xml.write(out, encoding="UTF-8")
#
# Compare two  directories of TEI files,  ignoring whitespace. Replace
# tei:text and ge:document elements if nothing significant has changed


from __future__ import print_function
import io
import string
import os
import sys
import lxml.etree
import faust
import copy
import rev_desc

txt_xp = faust.xpath("//tei:TEI/tei:text")
doc_xp = faust.xpath("//tei:TEI/ge:document")
body_xp = faust.xpath("//tei:TEI/tei:text/tei:body")

# automatically generated templates
template_xp = faust.xpath("//tei:TEI/tei:text/tei:body/tei:div[@type='template']")


# False for a dry run
replace_new = True


def compare_streams(one, two):
    """Decides if two streams are equal, ignoring whitespace"""

    char_1 = " "
Exemple #20
0
#!/usr/bin/env python
#
# Updates all TEI headers based on a template
#

import copy
import sys

import lxml.etree

import faust

# XPath expressions for extraction of templated header contents
handNotes_xp = faust.xpath("//tei:teiHeader/tei:profileDesc/tei:handNotes")
charDecl_xp = faust.xpath("//tei:teiHeader/tei:encodingDesc/tei:charDecl")

# Get the template and parse it
tei_template = faust.absolute_path("template/tei.xml")
template = lxml.etree.parse(tei_template)

# extract relevant header fragments from template
template_hand_notes = handNotes_xp(template)[0]
template_char_decl = charDecl_xp(template)[0]


def replace(node, with_node):
	'''Swap `node` for a deep copy of `with_node` in the parent of `node`.

	`with_node` itself is left untouched, so it may belong to another
	document.  Nothing is returned.
	'''
	parent = node.getparent()
	duplicate = copy.deepcopy(with_node)
	parent.replace(node, duplicate)

# iterate over TEI files (excluding the template)
for xml_file in faust.xml_files():
Exemple #21
0
#!/usr/bin/env python
#
# Report on the transcript status as specified under
#
# https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription
#

import sys

import lxml.etree

import faust

# XPath expression for extracting the whitespace-normalized text content of the ge:document element
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents
for xml_file in faust.xml_files():
    status = None
    try:
        if faust.is_tei_document(xml_file):
            xml = lxml.etree.parse(xml_file)
            if len(ge_doc_xp(xml).strip()) == 0: continue

            encoded = False
            for change in change_xp(xml):
                change_str = lxml.etree.tostring(change).lower()
                if "encoded" in change_str: encoded = True
Exemple #22
0
#!/usr/bin/env python
# coding=UTF-8
#
# Add a revision description change to an xml file represented by an ElementTree

import lxml.etree
import faust
import datetime

revdesc_xp = faust.xpath("//tei:teiHeader/tei:revisionDesc")
header_xp = faust.xpath("//tei:teiHeader")

def add_change (xml, who, content, when = datetime.date.today().isoformat()):
	'''Adds a change element to the revisionDesc in the TEI header'''

	header = header_xp(xml)

	if not header:
		raise ValueError("No TEI header present")

	# if there is no tei:revisionDesc element, insert one
	if not revdesc_xp(xml):
		rev_desc_element = lxml.etree.Element(faust.ns("tei") + "revisionDesc")
		# revisionDesc always goes to the end of the header
		header[0].append(rev_desc_element)


	# build change element
	attribs = {"when" : when,
		   "who" : who }
	change = lxml.etree.Element(faust.ns("tei") + "change", attribs)
Exemple #23
0
	def matches_in_file (file):
		'''Return [file, result of `xpath` for that file].

		`xpath` is not a parameter; it is taken from the surrounding scope.
		For a file that is not well-formed a message is written to stderr and
		nothing is returned (i.e. None).
		'''
		try:
			tree = lxml.etree.parse(file)
			result = faust.xpath(xpath)(tree)
			return [file, result]
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML syntax error: " + file + "\n")
Exemple #24
0
        pages.sort()
        for p in pages:
            p_xml = lxml.etree.Element(faust_ns + "materialUnit")
            p_xml.set("type", "page")
            p_xml.set("transcript", p)
            if last is None:
                document_xml.insert(0, p_xml)
            else:
                last.addnext(p_xml)
            last = p_xml
    metadata_xml = lxml.etree.Element(faust_ns + "metadata")
    document_xml.insert(0, metadata_xml)

    lxml.etree.SubElement(metadata_xml, faust_ns + "archive").text = "gsa"

    callnumber = faust.xpath("//f:signatur/text()", document_xml)[0]
    if callnumber in gsa_callnumber_mapping:
        callnumber = gsa_callnumber_mapping[
            callnumber] + " (" + callnumber + ")"
    lxml.etree.SubElement(metadata_xml,
                          faust_ns + "callnumber").text = callnumber

    wa_id_matches = faust.xpath("//f:key[@n='25']/following::f:value",
                                document_xml)
    if (len(wa_id_matches) > 0):
        wa_id = wa_id_matches[0].text
        if wa_id != "-" and wa_id != "oS":
            lxml.etree.SubElement(metadata_xml, faust_ns + "waId").text = wa_id

    xml_dir = faust.absolute_path("/".join(("document", ) +
                                           documents_struct[gsa_ident][0]))
Exemple #25
0
	def facs_invalid(file):
		# True as soon as one @url of a tei:facsimile/tei:graphic in the file is
		# contained in `faust_facsimiles` (a name from the surrounding scope),
		# False otherwise.
		tree = lxml.etree.parse(file)
		graphic_urls = faust.xpath("//tei:facsimile/tei:graphic/@url")(tree)
		return any(url in faust_facsimiles for url in graphic_urls)
#!/usr/bin/env python
#
# Report on the transcript status as specified under
#
# https://faustedition.uni-wuerzburg.de/wiki/index.php/Stand_der_Transkription
#

import sys

import lxml.etree

import faust

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

def count():
	# status counters
	status_keys = [key for (key, value) in faust.config.items("log-status")]
	status_dict = {}
	for key in status_keys:
		status_dict[key] = 0

	status_keys.sort()

	status_unknown = 0

	# iterate over all TEI documents
	for xml_file in faust.transcript_files():
		status = set()
		try:
Exemple #27
0
#!/usr/bin/env python

import faust
import query
import lxml.etree
import os.path

text_xp = faust.xpath("//tei:text")

for f in query.matches(query.documentary_by_name(),
                       "//tei:text and not(//ge:document)"):
    relpath = faust.relative_path(f)

    xml = lxml.etree.parse(f)
    text = text_xp(xml)[0]

    gedocument = lxml.etree.Element(faust.ns("ge") + "document",
                                    nsmap=faust.namespaces)
    surface = lxml.etree.Element(faust.ns("tei") + "surface")
    gedocument.append(surface)
    zone = lxml.etree.Element(faust.ns("tei") + "zone")
    zone.set("type", "main")
    surface.append(zone)

    text.addprevious(gedocument)

    out = os.path.join("/tmp/faust/" + relpath)
    outdir = os.path.dirname(out)
    try:
        os.makedirs(outdir)
    except:
Exemple #28
0
#
# Compare two  directories of TEI files,  ignoring whitespace. Replace
# tei:text and ge:document elements if nothing significant has changed


from __future__ import print_function
import io
import string
import os
import sys
import lxml.etree
import faust
import copy
import rev_desc

txt_xp = faust.xpath("//tei:TEI/tei:text")
doc_xp = faust.xpath("//tei:TEI/ge:document")
body_xp = faust.xpath("//tei:TEI/tei:text/tei:body")

#automatically generated templates
template_xp = faust.xpath("//tei:TEI/tei:text/tei:body/tei:div[@type='template']")



# False for a dry run
replace_new = True

def compare_streams(one, two):
	'''Decides if two streams are equal, ignoring whitespace'''

	char_1 = ' '