Example #1
def print_statistics():
	''' Print the result of some queries. '''
	kodiert = matches(faust.transcript_files(), kodiert_xp)
	encoded = matches(faust.transcript_files(), encoded_xp)
	kodiert_and_encoded = matches(faust.transcript_files(), kodiert_and_encoded_xp)

	print "kodiert: ", len(kodiert)
	print "encoded: ", len(encoded)
	print "kodiert und encoded: ", len(kodiert_and_encoded)
Example #3
def invalid_facsimile_links():
	''' Return all files with invalid facsimile links. '''
	faust_facsimiles = faust.facsimiles()
	def facs_invalid(file):
		xml = lxml.etree.parse(file)
		urls = faust.xpath("//tei:facsimile/tei:graphic/@url")(xml)
		for url in urls:
			# a link is invalid if it does not point to a known facsimile
			if url not in faust_facsimiles: return True
		return False
	return filter(facs_invalid, faust.transcript_files())
Example #5
def correct_graphic_uris():
    # take into account old GSA files
    files = [f for f in faust.transcript_files() if '/gsa/' in f]
    files.extend(xml_names_from_facsimiles())
    for f in files:
        rewrite_file = False

        try:
            xml = lxml.etree.parse(f)
        except IOError:
            # these should only be GSA files
            print "(", f, " doesn't exist)"
            continue
        print f
        graphics = graphic_xp(xml)

        if len(graphics) == 0:
            append_facsimile_element(xml)
            # find the newly appended element
            graphics = graphic_xp(xml)

        brutal = False
        if len(graphics) == 1:
            brutal = True

        for graphic in graphics:
            old = graphic.attrib["url"]
            new = correct_uri(old, brutal, f)
            graphic.attrib["url"] = new
            if new != old:
                print "   correcting: ", old, " -> ", new
                rewrite_file = True
        if rewrite_file:
            rev_desc.add_change(xml, "system", "facsimile_adapted")
            print "   writing"
            faust.tei_serialize(xml).write(f, encoding='UTF-8')
        else:
            print "   not writing"
Example #7
def count():
    # status counters
    status_keys = [key for (key, value) in faust.config.items("log-status")]
    status_dict = {}
    for key in status_keys:
        status_dict[key] = 0

    status_keys.sort()

    status_unknown = 0

    # iterate over all TEI documents
    for xml_file in faust.transcript_files():
        status = set()
        try:
            if faust.is_tei_document(xml_file):
                xml = lxml.etree.parse(xml_file)

                # iterate over all change records and collect every status keyword they mention
                for change in change_xp(xml):
                    change_str = lxml.etree.tostring(change).lower().strip()
                    for candidate in [key.strip() for key in status_keys]:
                        if candidate in change_str: status.add(candidate)
        except IOError:
            sys.stderr.write("I/O error while extracting status from " +
                             xml_file + "\n")
        except lxml.etree.XMLSyntaxError:
            sys.stderr.write("XML error while extracting status from " +
                             xml_file + "\n")

        if len(status) == 0:
            # no status given
            status_unknown += 1
        else:
            for s in status:
                # increment relevant status entry
                status_dict[s] += 1
    return status_dict, status_unknown
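count() only returns its tallies; a possible driver for printing them (hypothetical, not part of the original file):

if __name__ == "__main__":
	status_dict, status_unknown = count()
	for key in sorted(status_dict):
		print("%s: %d" % (key, status_dict[key]))
	print("unknown status: %d" % status_unknown)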
Example #9
import query, faust, os.path, sys, os, shutil
from bn import relpath

def destination(file):
    rel_f = relpath(file, faust.xml_dir)
    return os.path.join(faust.xml_dir, 'attic', rel_f)

if __name__ == "__main__":

    deleatur_transcripts = query.matches(faust.transcript_files(), query.deleatur_xp)

    if '-e' in sys.argv:
        print "executing"
        for f in deleatur_transcripts:
            print '         ' + f
            print '-->' + destination(f)
            print ''

            dest_dir = os.path.dirname(destination(f))
            print dest_dir
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
            shutil.move(f, destination(f))
    else:
        for f in deleatur_transcripts:
            print '         ' + f
            print '-->' + destination(f)
            print ''

        print "To execute, call with -e option"
Example #10
def documentary_by_name():
	# textual transcripts are the files whose name repeats the name of their folder
	textual = re.compile(r"transcript/.*/(.*)/\1.xml")
	def d_b_n(file):
		rel = faust.relative_path(file)
		return not textual.match(rel)
	return filter(d_b_n, faust.transcript_files())
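The backreference in the regular expression is the crux: a path counts as a textual transcript when the file name repeats the name of its containing folder, and everything else is treated as documentary. A small illustration with made-up paths (the concrete directory names are hypothetical):

import re

textual = re.compile(r"transcript/.*/(.*)/\1.xml")
print(bool(textual.match("transcript/gsa/390883/390883.xml")))  # True: file name repeats the folder name, so textual
print(bool(textual.match("transcript/gsa/390883/0002.xml")))    # False: kept by documentary_by_name as documentary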
Example #11
            #X u"//text()[contains(.,'\u231e')]", #8990
            #X u"//text()[contains(.,'\u231f')]", #8991
            #X u"//text()[contains(.,'\u2609')]", #9737
            #X u"//text()[contains(.,'\u263d')]", #9789
            u"//text()[contains(.,'\u0a50')]", #2640
            #X u"//text()[contains(.,'\u2e13')]", #11795
            #X u"//text()[contains(.,'\u2713')]", #10003
            #X u"//text()[contains(.,'\u002B')]",
            #X u"//text()[contains(.,'\u274C')]",
            # u"//text()[contains(.,'\u00bc')]", # 188
            # u"//text()[contains(.,'\u00bd')]", # 189
            # u"//text()[contains(.,'\u00be')]", # 190
            u"//text()[contains(.,'\u0391')]", # 913
            u"//text()[contains(.,'\u0392')]", # 914
            u"//text()[contains(.,'\u0393')]", # 915
            u"//text()[contains(.,'\u0394')]", # 916
            u"//text()[contains(.,'\u03b1')]", # 945
            u"//text()[contains(.,'\u03b2')]", # 946
            u"//text()[contains(.,'\u02e0')]", # 736
            u"//text()[contains(.,'\u03b4')]", # 948
            "//tei:change//comment()[contains(.,'Ritschel')]",
            "//tei:change[@who='ritschel']"
        ]
	bad_markup_disjunction = ' | '.join(bad_markup)
	all_documents = set(map(os.path.dirname, faust.transcript_files()))
	# print bad_markup_disjunction.encode('utf-8')
	bad_documents = set(map(os.path.dirname, matches(faust.transcript_files(), bad_markup_disjunction)))

	for d in all_documents.difference(bad_documents):
		print '/'.join(d.split('/')[-2:])
Example #13
def delete_empty_text_elements():
	files = query.matches(faust.transcript_files(),
						  "//tei:text[not(.//text() or //tei:div[@type='template' or .//comment()])]")
	xslt_trans = lxml.etree.XSLT(lxml.etree.parse("xsl/delete_empty_text_elements.xsl"))
	del_txt = lambda t: tei_transform(t, xslt_trans)
	transform_all(files, del_txt)
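tei_transform and transform_all are also missing from this listing. A sketch of what they plausibly do, reusing faust.tei_serialize from Example #5 (hypothetical helpers):

import lxml.etree
import faust

def tei_transform(tree, xslt_trans):
	''' Apply a compiled XSLT transformation to a parsed TEI tree (sketch). '''
	return xslt_trans(tree)

def transform_all(files, transform):
	''' Parse each file, run the transformation, and write the result back in place (sketch). '''
	for f in files:
		xml = lxml.etree.parse(f)
		faust.tei_serialize(transform(xml)).write(f, encoding='UTF-8')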
Example #14
def transform_stages_to_changes():
	xslt_trans = lxml.etree.XSLT(lxml.etree.parse("xsl/changes_to_stages.xsl"))
	changes_to_stages = lambda t: tei_transform(t, xslt_trans)
	transform_all(faust.transcript_files(), changes_to_stages)
import data_curation.query
import faust



#inscriptions_in_macrogenetic_files = data_curation.query.unique_values (faust.macrogenesis_files(),
#                                                                        '//f:item/@uri[contains(.,"i_")]')
#inscriptions_in_transcripts = data_curation.query.unique_values (faust.transcript_files(),
#                                                                        '//ge:stageNote/@xml:id[contains(.,"i_")]')
#bibliographic_sources_in_macrogenetic_files = data_curation.query.unique_values (faust.macrogenesis_files(),
#                                                                        '//f:source/@uri')



for f in faust.transcript_files():
    inscriptions = data_curation.query.unique_values([f], '//ge:stageNote/@xml:id[contains(.,"i_")]')
    for i in inscriptions:
        print '%s/%s' % (f, i)




#print
#print 'Inscriptions in macrogenetic files:                      %i' % len(inscriptions_in_macrogenetic_files)
#for value in inscriptions_in_macrogenetic_files: print value
#print 'Inscriptions in transcript files:                        %i' % len(inscriptions_in_transcripts)
#for value in inscriptions_in_transcripts: print value
# need to prefix inscriptions_in_transcripts with transcript uri for this to work
#print 'References to i.s in macrogenetic files without referent: %i' % len(inscriptions_in_macrogenetic_files - inscriptions_in_transcripts)
#print
#print 'Bibliographic sources in macrogenetic files:               %i' % len(bibliographic_sources_in_macrogenetic_files)
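data_curation.query.unique_values is not defined in this excerpt either. A sketch of the assumed behaviour, collecting the distinct values an XPath expression yields across a set of files (hypothetical implementation; it presumes faust.xpath also registers the f: and ge: prefixes used above):

import lxml.etree
import faust

def unique_values(files, xpath_expr):
	''' Return the set of distinct values the XPath produces over all files (sketch). '''
	values = set()
	for f in files:
		try:
			xml = lxml.etree.parse(f)
		except (IOError, lxml.etree.XMLSyntaxError):
			continue
		for value in faust.xpath(xpath_expr)(xml):
			values.add(value)
	return values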