Exemple #1
0
def to_convert():
    """Return the transcript XML files that have TEI text but no ge:document text.

    Every file reported by ``faust.xml_files()`` is examined and skipped when

    * the first component of its relative path is not ``"transcript"``,
    * its name, minus the last ``len(".xml")`` characters, equals the name
      of the directory that contains it,
    * the first run of digits in its name has the integer value 1, or
    * ``faust.is_tei_document()`` rejects it.

    A remaining file is selected when ``normalize-space(//tei:text)`` is
    non-empty while ``normalize-space(//ge:document)`` is empty.

    Returns:
        A list with the selected entries, exactly as yielded by
        ``faust.xml_files()``.
    """
    text_content_xp = faust.xpath("normalize-space(//tei:text)")
    ge_document_content_xp = faust.xpath("normalize-space(//ge:document)")

    def has_text_in(xp, xml):
        # Join whatever the XPath evaluation yields and check whether
        # anything other than surrounding whitespace is left.
        return len(" ".join(xp(xml)).strip()) > 0

    selected = []
    for xml_file in faust.xml_files():
        path = faust.relative_path(xml_file).split("/")
        if path[0] != "transcript":
            continue

        file_name = path[-1]
        if file_name[:-len(".xml")] == path[-2]:
            continue

        # A file name without any digits used to crash here with an
        # AttributeError on None. Such a file cannot be "number 1", so it is
        # passed on to the remaining checks.
        # NOTE(review): confirm that digit-less names should be kept rather
        # than skipped.
        number = re.search(r'[0-9]+', file_name)
        if number is not None and int(number.group(0)) == 1:
            continue

        if not faust.is_tei_document(xml_file):
            continue

        xml = lxml.etree.parse(xml_file)
        if has_text_in(text_content_xp, xml) and not has_text_in(ge_document_content_xp, xml):
            selected.append(xml_file)
    return selected
import faust

# Local element names that the scan loop below always skips.
ignored_tags = (
    "app",
    "back",
    "body",
    "choice",
    "div",
    "docTitle",
    "fix",
    "front",
    "fw",
    "g",
    "group",
    "lg",
    "overw",
    "patch",
    "sp",
    "subst",
    "surface",
    "text",
    "titlePage",
    "titlePart",
    "used",
    "zone",
)

# Local element names that the scan loop below skips only when the element
# is empty, i.e. has neither text nor child elements.
ignored_empty_elems = (
    "addSpan",
    "anchor",
    "cb",
    "certainty",
    "damageSpan",
    "delSpan",
    "gap",
    "grBrace",
    "grLine",
    "handShift",
    "ins",
    "join",
    "lb",
    "pb",
    "space",
    "st",
    "undo",
    "p",
)

# Selects every element below tei:text or ge:document that carries no
# xml:space attribute.
element_selector_xp = faust.xpath("//*[(ancestor::tei:text or ancestor::ge:document) and not(@xml:space)]")

# Whitespace-normalized string value of the context node.
text_content_xp = faust.xpath("normalize-space()")

# Maps the relative path of each scanned TEI document to a list.
candidates = {}
for xml_file in faust.xml_files():
	try:
		if faust.is_tei_document(xml_file):
			xml = lxml.etree.parse(xml_file)
			
			xml_key = faust.relative_path(xml_file)
			candidates[xml_key] = []
			
			for elem in element_selector_xp(xml):
				if elem.tag.startswith(faust.ns("svg")): continue
				
				local_name = elem.tag[elem.tag.rfind("}") + 1:]
				if local_name in ignored_tags: continue
				
				empty_elem = elem.text is None and len(elem) == 0
				if empty_elem and local_name in ignored_empty_elems: continue
#

import sys

import lxml.etree

import faust

# XPath expression yielding the whitespace-normalized text content of the
# document's ge:document element (an empty string when there is none)
ge_doc_xp = faust.xpath("normalize-space(//ge:document)")

# XPath expression selecting the tei:change entries of the revision
# description (the revision history) in the TEI header
change_xp = faust.xpath("//tei:teiHeader//tei:revisionDesc/tei:change")

# iterate over all TEI documents
for xml_file in faust.xml_files():
    status = None
    try:
        if faust.is_tei_document(xml_file):
            xml = lxml.etree.parse(xml_file)
            if len(ge_doc_xp(xml).strip()) == 0: continue

            encoded = False
            for change in change_xp(xml):
                change_str = lxml.etree.tostring(change).lower()
                if "encoded" in change_str: encoded = True
            if not encoded:
                print faust.relative_path(xml_file)

    except IOError:
        sys.stderr.write("I/O error while extracting status from " + xml_file +