Esempio n. 1
0
def to_convert():
	"""Collect the transcript XML files that still have to be converted.

	A file returned by faust.xml_files() is selected when all of these hold:

	* its relative path starts with "transcript",
	* its name (minus the 4-character ".xml" suffix) differs from the name
	  of its parent directory,
	* the first number in its file name is not 1,
	* faust.is_tei_document() accepts it,
	* it has text content in <tei:text/> but none in <ge:document/>.

	Returns:
		list of the selected files, as yielded by faust.xml_files().
	"""
	text_content_xp = faust.xpath("normalize-space(//tei:text)")
	ge_document_content_xp = faust.xpath("normalize-space(//ge:document)")

	def has_text_in(xp, xml):
		# the join keeps this working whether the XPath result is a single
		# string or a sequence of strings
		return len(" ".join(xp(xml)).strip()) > 0

	candidates = []
	for xml_file in faust.xml_files():
		path = faust.relative_path(xml_file).split("/")
		if path[0] != "transcript": continue
		file_name = path[-1]
		# skip the file named after its directory; the length guard avoids an
		# IndexError for a path consisting of a single segment
		if len(path) > 1 and file_name[:-len(".xml")] == path[-2]: continue
		# skip files numbered 1; a name without any digits is not "number 1"
		# (previously this crashed with an AttributeError on None)
		number = re.search(r'[0-9]+', file_name)
		if number is not None and int(number.group(0)) == 1: continue
		if not faust.is_tei_document(xml_file): continue

		xml = lxml.etree.parse(xml_file)
		if has_text_in(text_content_xp, xml) and not has_text_in(ge_document_content_xp, xml):
			candidates.append(xml_file)
	return candidates
Esempio n. 2
0
def validate(last=False):
	"""Validates the content of the queue by calling Jing and parsing its output.

	The validator is only run once the module-level validation_queue has
	reached validation_queue_max_length entries, or when `last` is true
	(final flush). Every message matched by validation_msg_re is appended to
	validation_report under the relative path of the offending file, formatted
	as "[line:column] message". The queue is emptied afterwards.

	Args:
		last: force validation of whatever is queued, even if the queue is
			not full yet.
	"""
	global validation_queue
	if not last and len(validation_queue) < validation_queue_max_length: return
	# nothing queued (e.g. final flush right after a full batch): do not spawn
	# the validator without any input files
	if not validation_queue: return

	validation = subprocess.Popen(shlex.split(validation_command) + validation_queue, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
	# stderr is merged into stdout, so this is the complete validator output
	validation_result = validation.communicate()[0] or ""
	for msg_match in validation_msg_re.finditer(validation_result):
		xml_file, line, column, msg = msg_match.groups()
		error_msg = "[%s:%s] %s" % (line, column, msg)
		validation_report.setdefault(faust.relative_path(xml_file), []).append(error_msg)

	validation_queue = []
Esempio n. 3
0
#!/usr/bin/env python

import faust
import query
import lxml.etree
import os.path

# XPath selecting every <tei:text/> element of a document
text_xp = faust.xpath("//tei:text")

# Process every file returned by query.matches() for the XPath test below
# (presumably: documents that have a <tei:text/> but no <ge:document/> --
# confirm against query.py)
for f in query.matches(query.documentary_by_name(),
                       "//tei:text and not(//ge:document)"):
    relpath = faust.relative_path(f)

    xml = lxml.etree.parse(f)
    # first match only; assumes the XPath yields at least one element
    text = text_xp(xml)[0]

    # build the skeleton
    # <ge:document><tei:surface><tei:zone type="main"/></tei:surface></ge:document>
    gedocument = lxml.etree.Element(faust.ns("ge") + "document",
                                    nsmap=faust.namespaces)
    surface = lxml.etree.Element(faust.ns("tei") + "surface")
    gedocument.append(surface)
    zone = lxml.etree.Element(faust.ns("tei") + "zone")
    zone.set("type", "main")
    surface.append(zone)

    # insert the skeleton as the sibling directly preceding <tei:text/>
    text.addprevious(gedocument)

    # output path below /tmp/faust/; os.path.join() with a single argument
    # returns that argument unchanged
    out = os.path.join("/tmp/faust/" + relpath)
    outdir = os.path.dirname(out)
    # NOTE(review): the snippet is cut off right after the bare except below
    try:
        os.makedirs(outdir)
    except:
Esempio n. 4
0
	"used", "zone")
	
# local names of elements that are not reported when they are empty
ignored_empty_elems = (
	"addSpan", "anchor", "cb", "certainty", "damageSpan", "delSpan", "gap", "grBrace", "grLine", "handShift", 
	"ins", "join", "lb", "pb", "space", "st", "undo", "p")

# all elements below <tei:text/> or <ge:document/> that carry no @xml:space
element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]")
# whitespace-normalized string value of the context node
text_content_xp = faust.xpath("normalize-space()")

# maps the relative path of each TEI document to the serialized elements
# that were flagged in it
candidates = dict()
for xml_file in faust.xml_files():
	try:
		if faust.is_tei_document(xml_file):
			xml = lxml.etree.parse(xml_file)
			
			xml_key = faust.relative_path(xml_file)
			candidates[xml_key] = []
			
			for elem in element_selector_xp(xml):
				# elements in the SVG namespace are never reported
				if elem.tag.startswith(faust.ns("svg")): continue
				
				# strip the "{namespace}" prefix of the lxml tag name
				local_name = elem.tag[elem.tag.rfind("}") + 1:]
				if local_name in ignored_tags: continue
				
				# empty = no leading text and no child nodes
				empty_elem = elem.text is None and len(elem) == 0
				if empty_elem and local_name in ignored_empty_elems: continue
				
				text_content = text_content_xp(elem)
				# flag empty elements, and elements whose normalized content is
				# non-empty but vanishes under str.strip() -- presumably content
				# made only of whitespace that normalize-space() keeps (confirm)
				if empty_elem or (len(text_content) > 0 and len(text_content.strip()) == 0):
					candidates[xml_key].append(lxml.etree.tostring(elem))
	# NOTE(review): the snippet is cut off right after this except clause
	except IOError:
Esempio n. 5
0
	def d_b_n(file):
		"""Return True unless the relative path of `file` matches `textual`.

		`textual` comes from the enclosing scope and is used through its
		match() method (presumably a compiled regular expression).
		"""
		return not textual.match(faust.relative_path(file))
Esempio n. 6
0
import sys

import lxml.etree

import faust

# XPath expression yielding the whitespace-normalized text content of
# <ge:document/> (an empty string when there is none)
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents and print the relative path of every
# document that has content in <ge:document/> but no <change/> entry
# mentioning "encoded" (case-insensitive match on the serialized element)
for xml_file in faust.xml_files():
    try:
        if not faust.is_tei_document(xml_file):
            continue
        xml = lxml.etree.parse(xml_file)
        if len(ge_doc_xp(xml).strip()) == 0:
            continue

        # any() stops at the first matching <change/>
        encoded = any("encoded" in lxml.etree.tostring(change).lower()
                      for change in change_xp(xml))
        if not encoded:
            # single-argument call form prints identically on Python 2 and 3
            print(faust.relative_path(xml_file))

    except IOError:
        sys.stderr.write("I/O error while extracting status from " + xml_file +
                         "\n")
Esempio n. 7
0
File: query.py Progetto: wmbr/app
	def d_b_n(file):
		"""Return True unless the relative path of `file` matches `textual`.

		`textual` comes from the enclosing scope and is used through its
		match() method (presumably a compiled regular expression).
		"""
		return not textual.match(faust.relative_path(file))
#!/usr/bin/env python

import faust
import query
import lxml.etree
import os.path

# XPath selecting every <tei:text/> element of a document
text_xp = faust.xpath("//tei:text")

# For every file returned by query.matches() for the XPath test below, insert
# an empty <ge:document/> skeleton in front of <tei:text/> and write the
# result to the same relative path below /tmp/faust/.
for f in query.matches (query.documentary_by_name(), "//tei:text and not(//ge:document)"):
	relpath = faust.relative_path(f)
	
	xml = lxml.etree.parse(f)
	# first match only; assumes the XPath yields at least one element
	text = text_xp(xml)[0]

	# <ge:document><tei:surface><tei:zone type="main"/></tei:surface></ge:document>
	gedocument = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
	surface = lxml.etree.Element(faust.ns("tei") + "surface")
	gedocument.append(surface)
	zone = lxml.etree.Element(faust.ns("tei") + "zone")
	zone.set("type", "main")
	surface.append(zone)

	text.addprevious(gedocument)

	out = "/tmp/faust/" + relpath
	outdir = os.path.dirname(out)
	try:
		os.makedirs(outdir)
	except OSError:
		# an already existing directory is expected; any other failure
		# (permissions, a file in the way, ...) must not be hidden
		if not os.path.isdir(outdir): raise
	xml.write(out, encoding="UTF-8")
Esempio n. 9
0
def convert():
	"""Convert the files listed by static_to_convert() to the new encoding.

	For every file a <ge:document/> is inserted before the first root child
	that is neither <teiHeader/> nor <facsimile/>. It receives a <tei:surface/>
	holding a deep copy of every <tei:body/> (renamed to <tei:zone/>), and this
	copy is then rewritten step by step into the documentary transcript (see
	the comments on the individual steps). Afterwards every <tei:text/> is
	cleaned up as the textual transcript. The result is written to
	"conversion_test/<relative dir>/conv_<file name>" in UTF-8.

	Relies on the module-level helpers static_to_convert() and
	remove_keep_tail(), which are defined elsewhere in this file.
	"""
	tei_text_xp = faust.xpath("//tei:text")
	# counter shared by all files, used to generate xml:id values
	xml_id_cnt = 0
	
	for xml_file in [faust.absolute_path(rel) for rel in static_to_convert()]:
		print(xml_file)

		xml = lxml.etree.parse(xml_file)

		# ***** Documentary transcript *****
		
		# prepare <ge:document/> context
		root = lxml.etree.Element(faust.ns("ge") + "document", nsmap=faust.namespaces)
		root.set(faust.ns("xml") + "id", "converted")
		for child in xml.getroot():
			if child.tag not in (faust.ns("tei") + "teiHeader", faust.ns("tei") + "facsimile"):
				child.addprevious(root)
				break
		
		surface = lxml.etree.SubElement(root, faust.ns("tei") + "surface")
		for body in faust.xpath(".//tei:body", xml):
			body_zone = copy.deepcopy(body)
			body_zone.tag = faust.ns("tei") + "zone"
			surface.append(body_zone)

		# from here on all conversions work on the copied content only
		root = surface

		# let <add/>/<del/> inherit @hand from <subst/>/<restore/>
		# (".//" for <restore/> as well: the former "./tei:restore" could never
		# match, because the children of the surface are zones)
		for container_with_hand in faust.xpath(".//tei:subst[@hand]|.//tei:restore[@hand]", root):
			hand = container_with_hand.get("hand")
			for add_xml in faust.xpath("./tei:add[count(@hand) = 0]", container_with_hand):
				add_xml.set("hand", hand)
			for del_xml in faust.xpath("./tei:del[count(@hand) = 0]", container_with_hand):
				del_xml.set("hand", hand)
			del container_with_hand.attrib["hand"]
						
		# convert @hand into <handShift/>
		for hand_annotated in faust.xpath(".//*[@hand]", root):
			if hand_annotated.tag not in (faust.ns("tei") + "add", faust.ns("tei") + "fw"): continue
			
			handShifts = faust.xpath("./preceding::tei:handShift", hand_annotated)
			# hand in effect before this element; placeholder if unknown or empty
			last_hand = (handShifts[-1].get("new") if handShifts else None) or "#i_have_no_idea"
			
			# start of new hand
			hs = lxml.etree.Element(faust.ns("tei") + "handShift")
			hs.set("new", hand_annotated.get("hand"))
			hs.tail = hand_annotated.text
			
			hand_annotated.text = None
			hand_annotated.insert(0, hs)
			
			
			# reset to last hand
			hs = lxml.etree.Element(faust.ns("tei") + "handShift")
			hs.set("new", last_hand)
			hand_annotated.append(hs)
			
			del hand_annotated.attrib["hand"]
				
		# convert <div/> with @type == "zone"
		for div in root.iter(faust.ns("tei") + "div"):
			if "zone" == div.get("type", ""):
				div.tag = faust.ns("tei") + "zone"
				del div.attrib["type"]

		# convert overwritten parts
		# (the misspelt "overwritiung" is matched on purpose: it is a value
		# accepted by the original conversion)
		att_vals = ("overwrite", "overwriting", "overwritiung")
		def type_overwr_in_attributes(element): return element.get("type", "") in att_vals
		def rend_overwr_in_attributes(element): return element.get("rend", "") in att_vals
		for subst in root.iter(faust.ns("tei") + "subst"):
			# real lists, so that the emptiness tests below are meaningful
			children_with_type = [child for child in subst if type_overwr_in_attributes(child)]
			children_with_rend = [child for child in subst if rend_overwr_in_attributes(child)]
			# type attribute in substitution
			if type_overwr_in_attributes(subst):
				del subst.attrib["type"]
			# rend attribute in substitution
			elif rend_overwr_in_attributes(subst):
				del subst.attrib["rend"]
			# type attribute in a child (in add or del)
			elif children_with_type:
				for child in children_with_type:
					del child.attrib["type"]
			# rend attribute in a child
			elif children_with_rend:
				for child in children_with_rend:
					del child.attrib["rend"]
			else:
				continue
			subst.tag = faust.ns("f") + "overw"
			for del_xml in subst.findall(faust.ns("tei") + "del"):
				del_xml.tag = faust.ns("f") + "under"
			for add in subst.findall(faust.ns("tei") + "add"):
				add.tag = faust.ns("f") + "over"

		# <div type="cleared"/> becomes <ge:used spanTo="#...">
		for div in root.iter(faust.ns("tei") + "div"):
			if "type" in div.attrib:
				if div.attrib["type"] == "cleared":
					used = lxml.etree.Element(faust.ns("ge") + "used")
					div.addprevious(used)
					xml_id_cnt += 1
					anchor_id = str(xml_id_cnt)
					used.set("spanTo", "#" + anchor_id)
					# move the content in front of the div, which becomes the
					# closing anchor of the span
					for child in div.getchildren():
						div.remove(child)
						div.addprevious(child)
					div.tag = faust.ns("tei") + "anchor"
					div.set(faust.ns("xml") + "id", anchor_id)

		# throw away text structure tagging
		lxml.etree.strip_tags(root,
			faust.ns("tei") + "div", faust.ns("tei") + "lg",
			faust.ns("tei") + "sp", faust.ns("tei") + "subst",
			faust.ns("tei") + "name", faust.ns("tei") + "addSpan")

		# remove Schroer numbers
		for l in root.iter(faust.ns("tei") + "l"): 
			if "n" in l.attrib: del l.attrib["n"]
		
		# create simple lines
		for line_element in ("speaker", "l", "p", "stage", "head", "ab"):
			line_elements = list(root.iter(faust.ns("tei") + line_element))
			for le in line_elements:
				if le.get("rend", "") in ["underline", "underlined", "centered unterline"]:
					# wrap the whole content in a <hi/> that keeps only @rend;
					# all other attributes stay on the line itself
					hi = copy.deepcopy(le)
					hi.tag = faust.ns("tei") + "hi"
					le.clear()
					for attr in list(hi.attrib.keys()):
						if attr == "rend": continue
						le.set(attr, hi.get(attr))
						del hi.attrib[attr]
					le.append(hi)
				le.tag = faust.ns("ge") + "line"
		
		# turn deletions into <f:st/> by default
		for del_xml in root.iter(faust.ns("tei") + "del"):
			del_xml.tag = faust.ns("f") + "st"
			del_type = del_xml.get("rend", "")
			if del_type == "strikethrough" or del_type == "strikedthrough": 
				del del_xml.attrib["rend"]
			
		# rename tags for fixations
		for rewrite_tag in ("fix", "repetition"):
			for rewrite in root.iter(faust.ns("tei") + rewrite_tag):
				rewrite.tag = faust.ns("ge") + "rewrite"


		# rename semantic tags with @rend="underline"
		for sem_hi_tag in ("emph", "name"):
			for sem_hi in root.iter(faust.ns("tei") + sem_hi_tag):
				if sem_hi.get("rend", "") == "underline":
					sem_hi.tag = faust.ns("tei") + "hi"
		
		# convert umlaut corrections
		umlaut_mapping = { 
			u"ä":u"a", u"Ä":u"A", 
			u"ö":u"o", u"Ö":u"O", 
			u"ü":u"u", u"Ü":u"U" 
			}
		corr_or_reg = itertools.chain(root.iter(faust.ns("tei") + "corr"), root.iter(faust.ns("tei") + "reg"))
		for element in corr_or_reg:
			for umlaut in umlaut_mapping:
				if element.text == umlaut:
					element.text = umlaut_mapping[umlaut]
					element.tag = faust.ns("tei") + "orig"
					
		# join lines with @rend='inline'
		for inline_line in list(faust.xpath(".//ge:line[@rend='inline']", root)):
			prev_lines = faust.xpath("./preceding::ge:line", inline_line)
			if len(prev_lines) == 0: continue
			prev_line = prev_lines[-1]
			
			if inline_line.text is None: 
				inline_line.text = " "
			else:
				inline_line.text += " "				
			inline_line.getparent().remove(inline_line)
			prev_line.append(inline_line)
			lxml.etree.strip_tags(prev_line, faust.ns("ge") + "line")
			
		# convert inline <lb/> to <ge:line/>
		for lb in list(root.iter(faust.ns("tei") + "lb")):
			parent = lb.getparent()
			if parent.tag != (faust.ns("ge") + "line"): continue

			lb.tag = faust.ns("ge") + "line"
			lb.text = lb.tail
			lb.tail = None
			# everything following the break moves into the new line ...
			sibling = lb.getnext()
			while sibling is not None:
				next_sibling = sibling.getnext()
				parent.remove(sibling)
				lb.append(sibling)
				sibling = next_sibling			
			# ... which is then placed right after the line it was split from
			parent.remove(lb)
			parent.addnext(lb)
		
		# put <note/> in zones		
		for note in list(root.iter(faust.ns("tei") + "note")):
			parent = surface
			if len(faust.xpath(".//ge:line", note)) == 0:
				parent = lxml.etree.SubElement(parent, faust.ns("tei") + "zone")
				note.tag = faust.ns("ge") + "line"
			else:
				note.tag = faust.ns("tei") + "zone"
			note.getparent().remove(note)			
			parent.append(note)
			if "place" in note.attrib: del note.attrib["place"]


		# u<ex>nd</ex> becomes "und"
		# (deliberately a no-op: the pattern is only detected, not rewritten)
		for ex in root.iter(faust.ns("tei") + "ex"):
			try: pre_sibling = next(ex.itersiblings(preceding=True))
			except StopIteration: continue
			if pre_sibling.text:
				if re.split(r"\s+", pre_sibling.text).pop() == "u":
					# only in two files, do nothing
					pass

		# <abbr>u</abbr> becomes "und"
		for abbr in root.iter(faust.ns("tei") + "abbr"):
			if abbr.text == "u":
				tail = abbr.tail
				if tail: abbr.tail = "und" + tail
				else: abbr.tail = "und"
				remove_keep_tail(abbr)

		# </ex> outside of <abbr/> becomes <supplied/>
		for ex in root.iter(faust.ns("tei") + "ex"):
			if not list(ex.iterancestors(faust.ns("tei") + "abbr")):
				ex.tag = faust.ns("tei") + "supplied"

		# <delSpan/> becomes <f:st/>
		for delSpan in root.iter(faust.ns("tei") + "delSpan"):
			delSpan.tag = faust.ns("f") + "st"

		

		# detach marginal elements
		for margin in list(faust.xpath(".//*[@place]", root)):
			place = margin.get("place")
			if place not in ("margin",
			 	"top", "top-left", "topleft", "top-right", "topright",
				"bottom", "bottom-left", "bottomleft", "bottom-right", "bottomright"):
				continue

			del margin.attrib["place"]
			parent = margin.getparent()
			
			# top placements go to the start of the surface, all others to the end
			margin_zone = lxml.etree.Element(faust.ns("tei") + "zone")
			if place.startswith("top"):
				surface.insert(0, margin_zone)
			else:
				surface.append(margin_zone)
				
			margin_parent = margin_zone
			if margin.tag != faust.ns("ge") + "line":
				margin_parent = lxml.etree.SubElement(margin_parent, faust.ns("ge") + "line")
				
			# link the new zone to the nearest enclosing line via @f:top,
			# giving that line an xml:id if it has none yet
			for ancestor in margin.iterancestors(faust.ns("ge") + "line"):
				line_id = ancestor.get(faust.ns("xml") + "id", None)
				if line_id is None:
					xml_id_cnt += 1
					line_id = "line_" + str(xml_id_cnt)
					ancestor.set(faust.ns("xml") + "id", line_id)
				margin_zone.set(faust.ns("f") + "top", "#" + line_id)
				break
			
			parent.remove(margin)
			margin_parent.append(margin)
			
		# detach interlinear additions
		for inter_add in list(faust.xpath(".//tei:add[@place='above' or @place='below']", root)):
			line = None
			for ancestor in inter_add.iterancestors(faust.ns("ge") + "line"):
				line = ancestor
				break
			if line is None: raise Exception(lxml.etree.tostring(inter_add))
			
			# reuse a neighbouring interlinear line if there is one,
			# otherwise create it above/below the current line
			adjunct_line = None
			if inter_add.get("place") == "above":
				adjunct_line = line.getprevious()
			else:
				adjunct_line = line.getnext()
			if (adjunct_line is None) or (adjunct_line.tag != (faust.ns("ge") + "line")) or\
				(adjunct_line.get("type", "") != "inter"):
				adjunct_line = lxml.etree.Element(faust.ns("ge") + "line")
				adjunct_line.set("type", "inter")
				if inter_add.get("place") == "above":
					line.addprevious(adjunct_line)
				else:
					line.addnext(adjunct_line)
			
			xml_id_cnt += 1
			anchor_id = "anchor_" + str(xml_id_cnt)
			
			ins_mark = lxml.etree.SubElement(adjunct_line, faust.ns("f") + "ins")
			ins_mark.set(faust.ns("f") + "at", "#" + anchor_id)
			
			# the addition's content moves to the interlinear line; the
			# former <add/> stays behind as the anchor the <f:ins/> points at
			ins_mark.tail = inter_add.text
			inter_add.text = None
			inter_add.tag = faust.ns("tei") + "anchor"
			inter_add.set(faust.ns("xml") + "id", anchor_id)
			for child in inter_add.getchildren():
				inter_add.remove(child)
				adjunct_line.append(child)
			del inter_add.attrib["place"]
			
		# remove remaining <add/> elements
		lxml.etree.strip_tags(root, faust.ns("tei") + "add")
		
		# remove <lb/>s, which are located in zones after conversion
		for lb in list(root.iter(faust.ns("tei") + "lb")):
			parent = lb.getparent()
			if parent.tag == (faust.ns("tei") + "zone"):
				parent.remove(lb)

			
		

		# convert some attribute values
		for typed in faust.xpath(".//*[@type='foliation']" , root):
			typed.set("type", "folioNum")
		for typed in faust.xpath(".//*[@type='sigel']" , root):
			typed.set("type", "sig")
		for typed in faust.xpath(".//*[@type='sigil']" , root):
			typed.set("type", "sig")

		# attribute values like "#_bl", "#_t" become "#sc_bl", "#sc_t"
		for any_elem in root.iter(tag=lxml.etree.Element):
			for attrib in any_elem.attrib:
				val = any_elem.get(attrib)
				if val.startswith("#_"):
					new_val ="#sc" + val[1:]
					any_elem.set(attrib, new_val)
				
		#remove type attributes for certain values
		for typed in faust.xpath(".//*[@type='instant' or @type='inst' or @type='instantrevision'\
		or @type='late' or @type='soon']" , root):
			del typed.attrib["type"]

		
		# ***** Textual transcript *****
		
		for text in faust.xpath(".//tei:text", xml):

			# remove hand attribute
			for hand_attributed in faust.xpath(".//*[@hand]", text):
				del hand_attributed.attrib["hand"]
			#remove handShifts
			lxml.etree.strip_tags(text, faust.ns("tei") + "handShift")

			# convert umlaut corrections
			# (both iterators must walk <tei:text/>; <reg/> used to be taken
			# from the documentary surface, so it was never converted here)
			corr_or_reg = itertools.chain(text.iter(faust.ns("tei") + "corr"), text.iter(faust.ns("tei") + "reg"))
			for element in corr_or_reg:
				for umlaut in umlaut_mapping:
					if element.text == umlaut:
						element.text = umlaut_mapping[umlaut]
						element.tag = faust.ns("tei") + "orig"

		# write the converted file
		path = ("conversion_test/" + faust.relative_path(xml_file)).split("/")
		path[-1] = "conv_" + path[-1]
		dir_path = "/".join(path[:-1])
		if not os.path.isdir(dir_path): os.makedirs(dir_path)

		xml.write("/".join(path), encoding="UTF-8")
#

import sys

import lxml.etree

import faust

# XPath expression yielding the whitespace-normalized text content of
# <ge:document/> (an empty string when there is none)
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents and print the relative path of every
# document that has content in <ge:document/> but no <change/> entry
# mentioning "encoded" (case-insensitive match on the serialized element)
for xml_file in faust.xml_files():
	try:
		if not faust.is_tei_document(xml_file):
			continue
		xml = lxml.etree.parse(xml_file)
		if len(ge_doc_xp(xml).strip()) == 0:
			continue

		# any() stops at the first matching <change/>
		encoded = any("encoded" in lxml.etree.tostring(change).lower()
			for change in change_xp(xml))
		if not encoded:
			# single-argument call form prints identically on Python 2 and 3
			print(faust.relative_path(xml_file))

	except IOError:
		sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
Esempio n. 11
0
# local names of elements that are not reported when they are empty
ignored_empty_elems = ("addSpan", "anchor", "cb", "certainty", "damageSpan",
                       "delSpan", "gap", "grBrace", "grLine", "handShift",
                       "ins", "join", "lb", "pb", "space", "st", "undo", "p")

# all elements below <tei:text/> or <ge:document/> that carry no @xml:space
element_selector_xp = faust.xpath(
    "//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]")
# whitespace-normalized string value of the context node
text_content_xp = faust.xpath("normalize-space()")

# maps the relative path of each TEI document to the serialized elements
# that were flagged in it
candidates = dict()
for xml_file in faust.xml_files():
    # NOTE(review): the handler belonging to this try is cut off in this snippet
    try:
        if faust.is_tei_document(xml_file):
            xml = lxml.etree.parse(xml_file)

            xml_key = faust.relative_path(xml_file)
            candidates[xml_key] = []

            for elem in element_selector_xp(xml):
                # elements in the SVG namespace are never reported
                if elem.tag.startswith(faust.ns("svg")): continue

                # strip the "{namespace}" prefix of the lxml tag name
                local_name = elem.tag[elem.tag.rfind("}") + 1:]
                # ignored_tags is defined outside this snippet
                if local_name in ignored_tags: continue

                # empty = no leading text and no child nodes
                empty_elem = elem.text is None and len(elem) == 0
                if empty_elem and local_name in ignored_empty_elems: continue

                text_content = text_content_xp(elem)
                # flag empty elements, and elements whose normalized content
                # is non-empty but vanishes under str.strip() -- presumably
                # content made only of whitespace that normalize-space()
                # keeps (confirm)
                if empty_elem or (len(text_content) > 0
                                  and len(text_content.strip()) == 0):
                    candidates[xml_key].append(lxml.etree.tostring(elem))