def print_statistics():
    '''Print how many transcript files match each of three predefined queries.

    The queries ``kodiert_xp``, ``encoded_xp`` and ``kodiert_and_encoded_xp``
    are each run over a fresh ``faust.transcript_files()`` listing. All three
    are evaluated before anything is printed; then one line per query is
    written, consisting of a label and the number of matches.
    '''
    labelled_queries = (
        ("kodiert: ", kodiert_xp),
        ("encoded: ", encoded_xp),
        ("kodiert und encoded: ", kodiert_and_encoded_xp),
    )

    # Collect every result first so the report is printed as one block.
    results = [
        (label, matches(faust.transcript_files(), xpath))
        for label, xpath in labelled_queries
    ]

    for label, found in results:
        # Label and count are separated by a single space.
        print("%s %d" % (label, len(found)))
def invalid_facsimile_links():
    '''Return the transcript files that contain an invalid facsimile link.

    A facsimile link is the ``url`` attribute of a
    ``tei:facsimile/tei:graphic`` element. It is treated as invalid when it
    is not among the values returned by ``faust.facsimiles()``.

    Nothing is printed; the result is whatever ``filter`` produces for
    ``faust.transcript_files()``.

    NOTE(review): the previous implementation selected a file when one of its
    URLs *was* found among the known facsimiles, which contradicted both the
    name of the predicate and this docstring. The test is now negated.
    '''
    known_facsimiles = faust.facsimiles()
    graphic_urls = faust.xpath("//tei:facsimile/tei:graphic/@url")

    def facs_invalid(path):
        # One link that is not a known facsimile is enough to flag the file.
        xml = lxml.etree.parse(path)
        return any(url not in known_facsimiles for url in graphic_urls(xml))

    return filter(facs_invalid, faust.transcript_files())
def correct_graphic_uris(): # take into account old GSA files files = [f for f in faust.transcript_files() if '/gsa/' in f] files.extend(xml_names_from_facsimiles()) for f in files: rewrite_file = False try: xml = lxml.etree.parse(f) except IOError: # these should only be GSA files print "(", f, " doesn't exist)" continue print f graphics = graphic_xp(xml) if len(graphics) == 0: append_facsimile_element(xml) # find the newly appended element graphics = graphic_xp(xml) brutal = False if len(graphics) == 1: brutal = True for graphic in graphics: old = graphic.attrib["url"] new = correct_uri(old, brutal, f) graphic.attrib["url"] = new if new != old: print " correcting: ", old, " -> ", new rewrite_file = True if rewrite_file: rev_desc.add_change(xml, "system", "facsimile_adapted") print " writing" faust.tei_serialize(xml).write(f, encoding='UTF-8') else: print " not writing"
def count():
    '''Count, per log status, the transcript files that mention that status.

    The status names are the keys of the ``log-status`` section of
    ``faust.config``. For every transcript file that
    ``faust.is_tei_document`` accepts, each element returned by ``change_xp``
    is serialised, lower-cased and searched for every status name. A file
    contributes once to each status found in any of its change records.

    Files in which no status was found are counted as unknown. This includes
    files that are not TEI documents and files that raised an I/O or XML
    syntax error; such errors are reported on standard error.

    Returns a tuple ``(status_dict, status_unknown)``: a dict mapping each
    status name to its file count, and the number of files without a status.
    '''
    # status counters, all starting at zero
    status_keys = sorted(key for key, _ in faust.config.items("log-status"))
    status_dict = dict.fromkeys(status_keys, 0)
    candidates = [key.strip() for key in status_keys]
    status_unknown = 0

    # iterate over all TEI documents
    for xml_file in faust.transcript_files():
        found = set()
        try:
            if faust.is_tei_document(xml_file):
                xml = lxml.etree.parse(xml_file)
                # scan every change record and remember each status it mentions
                for change in change_xp(xml):
                    change_str = lxml.etree.tostring(change).lower().strip()
                    found.update(c for c in candidates if c in change_str)
        except IOError:
            sys.stderr.write("I/O error while extracting status from " + xml_file + "\n")
        except lxml.etree.XMLSyntaxError:
            sys.stderr.write("XML error while extracting status from " + xml_file + "\n")

        if not found:
            # no status given
            status_unknown += 1
            continue

        # increment every status entry seen in this file
        for status in found:
            status_dict[status] += 1

    return status_dict, status_unknown
import query
import faust
import os.path
import sys
import os
import shutil
from bn import relpath


def destination(file):
    '''Return the path for *file* below the 'attic' folder of ``faust.xml_dir``.

    The result is ``faust.xml_dir`` joined with ``'attic'`` and with
    ``relpath(file, faust.xml_dir)``.
    '''
    inside_xml_dir = relpath(file, faust.xml_dir)
    return os.path.join(faust.xml_dir, 'attic', inside_xml_dir)


if __name__ == "__main__":
    # Transcripts matched by query.deleatur_xp are listed with their attic
    # destination; they are only moved when the script is called with -e.
    deleatur_transcripts = query.matches(faust.transcript_files(), query.deleatur_xp)
    execute = '-e' in sys.argv

    if execute:
        print("executing")

    for f in deleatur_transcripts:
        target = destination(f)
        print(' ' + f)
        print('-->' + target)
        print('')
        if execute:
            dest_dir = os.path.dirname(target)
            print(dest_dir)
            # create missing folders in the attic before moving the file
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
            shutil.move(f, target)

    if not execute:
        print("To execute, call with -e option")
def documentary_by_name():
    '''Return the transcript files whose name does not repeat their folder name.

    A file is left out when its path, as returned by
    ``faust.relative_path``, starts with ``transcript/<anything>/<name>/<name>.xml``.
    Every other transcript file is kept.

    Returns whatever ``filter`` produces for ``faust.transcript_files()``.
    '''
    # The dot before "xml" is escaped: unescaped it matched any character,
    # so a name such as "<name>_xml" also counted as "<name>.xml".
    textual = re.compile(r"transcript/.*/(.*)/\1\.xml")

    def d_b_n(path):
        # match() anchors at the start of the relative path only
        return not textual.match(faust.relative_path(path))

    return filter(d_b_n, faust.transcript_files())
#X u"//text()[contains(.,'\u231e')]", #8990 #X u"//text()[contains(.,'\u231f')]", #8991 #X u"//text()[contains(.,'\u2609')]", #9737 #X u"//text()[contains(.,'\u263d')]", #9789 u"//text()[contains(.,'\u0a50')]", #2640 #X u"//text()[contains(.,'\u2e13')]", #11795 #X u"//text()[contains(.,'\u2713')]", #10003 #X u"//text()[contains(.,'\u002B')]", #X u"//text()[contains(.,'\u274C')]", # u"//text()[contains(.,'\u00bc')]", # 188 # u"//text()[contains(.,'\u00bd')]", # 189 # u"//text()[contains(.,'\u00be')]", # 190 u"//text()[contains(.,'\u0391')]", # 913 u"//text()[contains(.,'\u0392')]", # 914 u"//text()[contains(.,'\u0393')]", # 915 u"//text()[contains(.,'\u0394')]", # 916 u"//text()[contains(.,'\u03b1')]", # 945 u"//text()[contains(.,'\u03b2')]", # 946 u"//text()[contains(.,'\u02e0')]", # 736 u"//text()[contains(.,'\u03b4')]", # 948 "//tei:change//comment()[contains(.,'Ritschel')]", "//tei:change[@who='ritschel']" ] bad_markup_disjunction = ' | '.join(bad_markup) all_documents = set(map (os.path.dirname, faust.transcript_files())) # print bad_markup_disjunction.encode('utf-8') bad_documents = set(map (os.path.dirname, matches(faust.transcript_files(), bad_markup_disjunction))) for d in all_documents.difference(bad_documents): print '/'.join(d.split('/')[-2:])
def delete_empty_text_elements():
    '''Apply xsl/delete_empty_text_elements.xsl to the transcripts selected below.

    The files are chosen with ``query.matches`` using an XPath for
    ``tei:text`` elements; each selected file is handed to ``transform_all``
    together with a callable that runs ``tei_transform`` with the compiled
    stylesheet.
    '''
    # NOTE(review): the second operand of "or" is the absolute path
    # "//tei:div[...]", and ".//comment()" sits inside the div predicate.
    # This may not be the intended grouping -- confirm before relying on it.
    empty_text_xp = "//tei:text[not(.//text() or //tei:div[@type='template' or .//comment()])]"
    files = query.matches(faust.transcript_files(), empty_text_xp)

    stylesheet = lxml.etree.parse("xsl/delete_empty_text_elements.xsl")
    xslt_trans = lxml.etree.XSLT(stylesheet)

    def del_txt(t):
        return tei_transform(t, xslt_trans)

    transform_all(files, del_txt)
def transform_stages_to_changes():
    '''Apply xsl/changes_to_stages.xsl to every transcript file.

    The stylesheet is compiled once; ``transform_all`` receives all
    transcript files together with a callable that runs ``tei_transform``
    with that stylesheet.

    NOTE(review): the function name says "stages to changes" while the
    stylesheet is called "changes_to_stages" -- confirm which direction is
    actually meant.
    '''
    stylesheet = lxml.etree.parse("xsl/changes_to_stages.xsl")
    xslt_trans = lxml.etree.XSLT(stylesheet)

    def changes_to_stages(t):
        return tei_transform(t, xslt_trans)

    transform_all(faust.transcript_files(), changes_to_stages)
import data_curation.query
import faust

#inscriptions_in_macrogenetic_files = data_curation.query.unique_values (faust.macrogenesis_files(),
#   '//f:item/@uri[contains(.,"i_")]')
#inscriptions_in_transcripts = data_curation.query.unique_values (faust.transcript_files(),
#   '//ge:stageNote/@xml:id[contains(.,"i_")]')
#bibliographic_sources_in_macrogenetic_files = data_curation.query.unique_values (faust.macrogenesis_files(),
#   '//f:source/@uri')

# For every transcript file, print one line "<file>/<id>" for each unique
# xml:id of a ge:stageNote whose id contains "i_".
INSCRIPTION_ID_XP = '//ge:stageNote/@xml:id[contains(.,"i_")]'

for transcript in faust.transcript_files():
    # the query is run on one file at a time so each id can be prefixed with its file
    for inscription in data_curation.query.unique_values([transcript], INSCRIPTION_ID_XP):
        print('%s/%s' % (transcript, inscription))

#print
#print 'Inscriptions in macrogenetic files: %i' % len(inscriptions_in_macrogenetic_files)
#for value in inscriptions_in_macrogenetic_files: print value
#print 'Inscriptions in transcript files: %i' % len(inscriptions_in_transcripts)
#for value in inscriptions_in_transcripts: print value
# need to prefix inscriptions_in_transcripts with transcript uri for this to work
#print 'References to i.s in macrogentic files without referent: %i' % len(inscriptions_in_macrogenetic_files - inscriptions_in_transcripts)
#print
#print 'Biligraphic sources in macrogenetic files: %i' % len(bibliographic_sources_in_macrogenetic_files)