コード例 #1
ファイル: transform.py プロジェクト: wmbr/app
def tei_transform(tei_file, transform_etree):
	'''Transform a TEI file in place.

	tei_file -- path of the document; it is parsed, transformed and then
	            overwritten with the serialized result (UTF-8)
	transform_etree -- callable taking the parsed document and returning
	                   the tree to serialize

	Nothing is returned. A file that is not a TEI document is reported on
	stderr and left untouched. I/O errors and XML syntax errors are reported
	on stderr instead of being raised.
	'''
	try:
		if not faust.is_tei_document(tei_file):
			sys.stderr.write("Not a TEI file: " + tei_file + "\n")
			# do not parse and overwrite a file that was just rejected
			return
		xml = lxml.etree.parse(tei_file)
		result = transform_etree(xml)
		faust.tei_serialize(result).write(tei_file, encoding="UTF-8")
	except IOError:
		sys.stderr.write("I/O error while transforming " + tei_file + "\n")
	except lxml.etree.XMLSyntaxError:
		sys.stderr.write("XML syntax error while transforming " + tei_file + "\n")
コード例 #2
def to_convert():
	'''Return the transcript files that still need conversion.

	A file qualifies when all of the following hold:
	- its relative path starts with "transcript",
	- its name (without ".xml") differs from the name of its directory,
	- the first number in its name is not 1 (names without digits pass),
	- it is a TEI document,
	- it has text in tei:text but none in ge:document.

	Returns a list of the qualifying entries of faust.xml_files().
	'''
	text_content_xp = faust.xpath("normalize-space(//tei:text)")
	ge_document_content_xp = faust.xpath("normalize-space(//ge:document)")

	def has_text_in(xp, xml):
		# true if the XPath result holds at least one non-whitespace character
		return len(" ".join(xp(xml)).strip()) > 0

	pending = []
	for xml_file in faust.xml_files():
		path = faust.relative_path(xml_file).split("/")
		if path[0] != "transcript":
			continue
		file_name = path[-1]
		# skip the file that is named after its containing directory
		if file_name[:-len(".xml")] == path[-2]:
			continue
		# re.search returns None when the name contains no digits
		number = re.search(r'[0-9]+', file_name)
		if number is not None and int(number.group(0)) == 1:
			continue
		if not faust.is_tei_document(xml_file):
			continue
		xml = lxml.etree.parse(xml_file)
		if has_text_in(text_content_xp, xml) and not has_text_in(ge_document_content_xp, xml):
			pending.append(xml_file)
	return pending
コード例 #3
ファイル: report_encoding_status.py プロジェクト: wmbr/app
def count():
    '''Count transcript files per encoding status.

    The status keys are the option names of the "log-status" config section.
    A file is counted under every key whose (stripped) name occurs in the
    lower-cased serialization of one of its change records.

    Returns a tuple (status_dict, status_unknown): status_dict maps each
    status key to the number of files mentioning it, status_unknown is the
    number of files for which no status was found. Files that are not TEI
    documents or that fail to parse also end up in status_unknown; parse and
    I/O failures are additionally reported on stderr.
    '''
    # status counters
    status_keys = [key for (key, value) in faust.config.items("log-status")]
    status_dict = dict.fromkeys(status_keys, 0)
    status_unknown = 0

    # iterate over all TEI documents
    for xml_file in faust.transcript_files():
        status = set()
        try:
            if faust.is_tei_document(xml_file):
                xml = lxml.etree.parse(xml_file)

                # iterate over all change records and collect every status mentioned
                for change in change_xp(xml):
                    change_str = lxml.etree.tostring(change).lower().strip()
                    for key in status_keys:
                        # record the original key so the counter lookup below cannot miss
                        if key.strip() in change_str:
                            status.add(key)
        except IOError:
            sys.stderr.write("I/O error while extracting status from " +
                             xml_file + "\n")
        except lxml.etree.XMLSyntaxError:
            sys.stderr.write("XML error while extracting status from " +
                             xml_file + "\n")

        if not status:
            # no status given
            status_unknown += 1
        else:
            for s in status:
                # increment relevant status entry
                status_dict[s] += 1
    return status_dict, status_unknown
コード例 #4
def count():
	'''Count transcript files per encoding status.

	The status keys are the option names of the "log-status" config section.
	A file is counted under every key whose (stripped) name occurs in the
	lower-cased serialization of one of its change records.

	Returns a tuple (status_dict, status_unknown): status_dict maps each
	status key to the number of files mentioning it, status_unknown is the
	number of files for which no status was found. Files that are not TEI
	documents or that fail to parse also end up in status_unknown; parse and
	I/O failures are additionally reported on stderr.
	'''
	# status counters
	status_keys = [key for (key, value) in faust.config.items("log-status")]
	status_dict = dict.fromkeys(status_keys, 0)
	status_unknown = 0

	# iterate over all TEI documents
	for xml_file in faust.transcript_files():
		status = set()
		try:
			if faust.is_tei_document(xml_file):
				xml = lxml.etree.parse(xml_file)

				# iterate over all change records and collect every status mentioned
				for change in change_xp(xml):
					change_str = lxml.etree.tostring(change).lower().strip()
					for key in status_keys:
						# record the original key so the counter lookup below cannot miss
						if key.strip() in change_str:
							status.add(key)
		except IOError:
			sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
		except lxml.etree.XMLSyntaxError:
			sys.stderr.write("XML error while extracting status from " + xml_file + "\n")

		if not status:
			# no status given
			status_unknown += 1
		else:
			for s in status:
				# increment relevant status entry
				status_dict[s] += 1
	return status_dict, status_unknown
コード例 #5
# Local element names that the scan below skips unconditionally.
ignored_tags = (
	"app", "back", "body", "choice", "div",
	"docTitle", "fix", "front", "fw", "g",
	"group", "lg", "overw", "patch", "sp",
	"subst", "surface", "text", "titlePage", "titlePart",
	"used", "zone",
)

# Local element names that the scan below skips only when the element is empty.
ignored_empty_elems = (
	"addSpan", "anchor", "cb", "certainty", "damageSpan",
	"delSpan", "gap", "grBrace", "grLine", "handShift",
	"ins", "join", "lb", "pb", "space",
	"st", "undo", "p",
)

# Selects every element below tei:text or ge:document that carries no xml:space attribute.
element_selector_xp = faust.xpath(
	"//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]"
)
# Whitespace-normalized string value of the context node.
text_content_xp = faust.xpath("normalize-space()")

# Maps the relative path of each TEI document to a list that starts out empty.
candidates = dict()
# NOTE(review): the loop body is indented one level deeper than the `for`
# header with no enclosing statement -- presumably a `try:` line (and its
# `except` handlers) was lost from this excerpt; confirm against the full script.
for xml_file in faust.xml_files():
		if faust.is_tei_document(xml_file):
			xml = lxml.etree.parse(xml_file)
			xml_key = faust.relative_path(xml_file)
			candidates[xml_key] = []
			for elem in element_selector_xp(xml):
				# skip elements whose tag starts with faust.ns("svg") (presumably the SVG namespace prefix)
				if elem.tag.startswith(faust.ns("svg")): continue
				# drop the "{namespace}" prefix of the tag to get the local name
				local_name = elem.tag[elem.tag.rfind("}") + 1:]
				if local_name in ignored_tags: continue
				# an element counts as empty when it has no leading text and no children
				empty_elem = elem.text is None and len(elem) == 0
				if empty_elem and local_name in ignored_empty_elems: continue
				text_content = text_content_xp(elem)
				# NOTE(review): the excerpt ends here; how text_content is used and
				# what gets appended to candidates[xml_key] is not visible -- TODO confirm
コード例 #6
ファイル: count_unencoded_documents.py プロジェクト: wmbr/app
import sys

import lxml.etree

import faust

# XPath expression for extracting the whitespace-normalized text of ge:document
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression for extracting the revision history from TEI documents
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents and print the relative path of each one that
# has ge:document text but no change record mentioning "encoded"
for xml_file in faust.xml_files():
    try:
        if faust.is_tei_document(xml_file):
            xml = lxml.etree.parse(xml_file)
            # documents without any ge:document text are not reported
            if len(ge_doc_xp(xml).strip()) == 0:
                continue

            encoded = any(
                "encoded" in lxml.etree.tostring(change).lower()
                for change in change_xp(xml)
            )
            if not encoded:
                # parenthesised form prints the same on Python 2 and 3
                print(faust.relative_path(xml_file))

    except IOError:
        sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("XML error while extracting status from " + xml_file + "\n")
コード例 #7
ファイル: update_tei_docs.py プロジェクト: wmbr/app
# Get the template and parse it
# (tei_template is also compared against each file path later to skip the template itself)
tei_template = faust.absolute_path("template/tei.xml")
template = lxml.etree.parse(tei_template)

# extract relevant header fragments from template
# handNotes_xp and charDecl_xp are not defined in this excerpt. Only the first
# match of each is kept; presumably they return lists, so a template without a
# match would fail here with IndexError -- TODO confirm.
template_hand_notes = handNotes_xp(template)[0]
template_char_decl = charDecl_xp(template)[0]

def replace(node, with_node):
	'''Swap node for an independent deep copy of with_node.

	The copy is handed to the replace() method of node's parent, so with_node
	itself (which may belong to another document) is left where it is.
	'''
	parent = node.getparent()
	clone = copy.deepcopy(with_node)
	parent.replace(node, clone)

# iterate over TEI files (excluding the template) and overwrite each one with
# its header fragments replaced by the template's versions
for xml_file in faust.xml_files():
	# errors are reported per file so one bad document does not stop the run
	try:
		if (xml_file != tei_template) and faust.is_tei_document(xml_file):
			xml = lxml.etree.parse(xml_file)
			# replace header fragments
			for hand_notes in handNotes_xp(xml):
				replace(hand_notes, template_hand_notes)
			for char_decl in charDecl_xp(xml):
				replace(char_decl, template_char_decl)
			# write back updated document
			faust.tei_serialize(xml).write(xml_file, encoding="UTF-8")
	except IOError:
		sys.stderr.write("I/O error while updating " + xml_file + "\n")
	except lxml.etree.XMLSyntaxError:
		sys.stderr.write("XML syntax error while updating " + xml_file + "\n")