class ZefaniaXMLBible(Bible):
    """
    Class for reading, validating, and converting ZefaniaXMLBible XML.

    The file is parsed with ElementTree, the optional INFORMATION header is
    extracted into attributes of this object, and every BIBLEBOOK element is
    converted into a BibleBook holding USFM-like lines ('c', 'v', 'v~', 'm').
    """
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'XMLBIBLE'
    infoTag = 'INFORMATION'
    bookTag = 'BIBLEBOOK'
    chapterTag = 'CHAPTER'
    captionTag = 'CAPTION'
    verseTag = 'VERS'
    noteTag = 'NOTE'
    styleTag = 'STYLE'
    breakTag = 'BR'

    # Maps each INFORMATION child tag to a 2-tuple:
    #   (name of the attribute that stores its text, or None if nothing is stored,
    #    whether the text is compulsory -- only asserted when Globals.debugFlag is set)
    __headerFields = {
        'title': ('title', True),
        'creator': ('creator', False),
        'subject': ('subject', False),
        'description': ('description', True),
        'publisher': ('publisher', False),
        'contributors': ('contributors', False),
        'date': ('date', True),
        'type': ('documentType', False),
        'format': (None, True),
        'identifier': ('identifier', True),
        'source': ('source', True),
        'language': ('language', True),
        'coverage': ('coverage', False),
        'rights': ('rights', False),
    }

    # Maps the css attribute values of STYLE elements to the equivalent USFM character marker.
    __cssToSFM = {
        "font-style:italic": '\\it',
        "font-style:italic;font-weight:bold": '\\bdit',
        "color:#FF0000": '\\em',
        "font-size: x-small; color:#8B8378": '\\add',
    }

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the Zefania Bible object.

        sourceFolder and givenName are joined to give the path of the XML file.
        The encoding is stored but the file itself is not read until load() is called.
        """
        # Setup and initialise the base class first
        Bible.__init__(self)
        self.objectNameString = "Zefania XML Bible object"
        self.objectTypeString = "Zefania"

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.tree = self.header = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")

        # Do a preliminary check on the readability of our file
        if not os.access(self.sourceFilepath, os.R_OK):
            print("ZefaniaXMLBible: File '{}' is unreadable".format(self.sourceFilepath))

        self.name = self.givenName  # May be replaced by the biblename attribute when loading
    # end of ZefaniaXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        The INFORMATION header is accepted either as the first or as the last
        child of the root element.
        """
        if Globals.verbosityLevel > 2:
            print(_("Loading {}...").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if Globals.debugFlag:
            assert len(self.tree)  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == ZefaniaXMLBible.treeTag:
            location = "Zefania XML file"
            Globals.checkXMLNoText(self.tree, location, '4f6h')
            Globals.checkXMLNoTail(self.tree, location, '1wk8')

            name = status = revision = version = None
            # These attributes are recognised (so no warning is given) but their values are not used
            ignoredAttributes = (ZefaniaXMLBible.XMLNameSpace + 'noNamespaceSchemaLocation', "lgid", "type")
            for attrib, value in self.tree.items():
                if attrib == "biblename":
                    name = value
                elif attrib == "status":
                    status = value
                elif attrib == "revision":
                    revision = value
                elif attrib == "version":
                    version = value
                elif attrib in ignoredAttributes:
                    pass
                else:
                    logging.warning("Unprocessed '{}' attribute ({}) in main element".format(attrib, value))
            if name: self.name = name
            if status: self.status = status
            if revision: self.revision = revision
            if version: self.version = version

            if len(self.tree):
                # The information record is normally first, but some files put it at the END
                for candidate in (self.tree[0], self.tree[-1]):
                    if candidate.tag == ZefaniaXMLBible.infoTag:
                        self.header = candidate
                        self.tree.remove(candidate)
                        self.__validateAndExtractHeader()
                        break
            else:  # Indexing an empty root element would raise an IndexError
                logging.error("No elements found in {}".format(location))

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == ZefaniaXMLBible.bookTag:
                    sublocation = "book in " + location
                    Globals.checkXMLNoText(element, sublocation, 'g3g5')
                    Globals.checkXMLNoTail(element, sublocation, 'd3f6')
                    self.__validateAndExtractBook(element)
                else:
                    logging.error("Expected to find '{}' but got '{}'".format(ZefaniaXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load '{}' but got '{}'".format(ZefaniaXMLBible.treeTag, self.tree.tag))
        self.doPostLoadProcessing()
    # end of ZefaniaXMLBible.load

    def __validateAndExtractHeader(self):
        """
        Extracts information out of the header record, such as:
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>...</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Zefania XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
            </INFORMATION>

        Compulsory fields are always stored (and asserted to be non-empty in debug mode);
            optional fields are only stored when they contain text.
        The format field is only checked, never stored.
        """
        if Globals.debugFlag:
            assert self.header
        location = 'Header'
        Globals.checkXMLNoAttributes(self.header, location, 'j4j6')
        Globals.checkXMLNoText(self.header, location, 'sk4l')
        Globals.checkXMLNoTail(self.header, location, 'a2d4')

        # TODO: We probably need to rationalise some of the self.xxx stores
        for element in self.header:
            field = ZefaniaXMLBible.__headerFields.get(element.tag)
            if field is None:
                logging.error("Found unexpected '{}' tag in {}".format(element.tag, location))
                continue
            attributeName, compulsory = field
            sublocation = "{} in {}".format(element.tag, location)
            Globals.checkXMLNoTail(element, sublocation, 'al1d')
            Globals.checkXMLNoAttributes(element, sublocation, 'j3jd')
            Globals.checkXMLNoSubelements(element, sublocation, '5g78')
            if compulsory:
                if Globals.debugFlag:
                    assert element.text
                if attributeName is None:  # i.e., the format field which has a fixed value
                    if Globals.debugFlag:
                        assert element.text == 'Zefania XML Bible Markup Language'
                else:
                    setattr(self, attributeName, element.text)
            elif element.text:
                setattr(self, attributeName, element.text)
    # end of ZefaniaXMLBible.__validateAndExtractHeader

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book is identified by its bnumber attribute, or by its bname attribute
            if there is no number. Books that can't be identified are ignored.
        """
        if Globals.verbosityLevel > 3:
            print(_("Validating XML book..."))

        # Process the div attributes first
        BBB = bookName = bookShortName = bookNumber = None
        for attrib, value in book.items():
            if attrib == "bnumber":
                bookNumber = value
            elif attrib == "bname":
                bookName = value
            elif attrib == "bsname":
                bookShortName = value
            else:
                logging.warning("Unprocessed '{}' attribute ({}) in book element".format(attrib, value))
        if bookNumber:
            try:
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bookNumber)
            except KeyError:
                logging.warning("Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it"
                                .format(bookNumber, bookName, bookShortName))
        elif bookName:
            BBB = self.genericBOS.getBBB(bookName)

        if BBB:
            if Globals.verbosityLevel > 2:
                print(_("Validating {} {}...").format(BBB, bookName))
            thisBook = BibleBook(self.name, BBB)
            thisBook.objectNameString = "Zefania XML Bible Book object"
            thisBook.objectTypeString = "Zefania"
            for element in book:
                if element.tag == ZefaniaXMLBible.chapterTag:
                    sublocation = "chapter in {}".format(BBB)
                    Globals.checkXMLNoText(element, sublocation, 'j3jd')
                    Globals.checkXMLNoTail(element, sublocation, 'al1d')
                    self.__validateAndExtractChapter(BBB, thisBook, element)
                else:
                    logging.error("Expected to find '{}' but got '{}'".format(ZefaniaXMLBible.chapterTag, element.tag))
            if Globals.verbosityLevel > 2:
                print(" Saving {} into results...".format(BBB))
            self.saveBook(thisBook)
    # end of ZefaniaXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        Captions (used in Psalms) are saved as verse zero.
        """
        if Globals.verbosityLevel > 3:
            print(_("Validating XML chapter..."))

        # Process the chapter attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "cnumber":
                chapterNumber = value
            else:
                logging.warning("Unprocessed '{}' attribute ({}) in chapter element".format(attrib, value))
        if chapterNumber:
            thisBook.appendLine('c', chapterNumber)
        else:
            logging.error("Missing 'cnumber' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag == ZefaniaXMLBible.verseTag:
                self.__validateAndExtractVerse(BBB, chapterNumber, thisBook, element)
            elif element.tag == ZefaniaXMLBible.captionTag:  # Used in Psalms
                location = "caption in {} {}".format(BBB, chapterNumber)
                Globals.checkXMLNoTail(element, location, 'k5k8')
                Globals.checkXMLNoSubelements(element, location, 'd3f5')

                # Handle caption attributes
                vRef = None
                for attrib, value in element.items():
                    if attrib == "vref":
                        vRef = value
                        if Globals.debugFlag:
                            assert vRef == '1'
                    else:
                        logging.warning("Unprocessed '{}' attribute ({}) in caption element".format(attrib, value))
                if Globals.debugFlag:
                    assert vRef
                vText = element.text
                if vText:  # This is the main text of the caption
                    thisBook.appendLine('v', '0' + ' ' + vText)  # We save it as verse zero
                else:
                    logging.warning("{} {}:{} has no text".format(BBB, chapterNumber, vRef))
            else:
                logging.error("Expected to find '{}' but got '{}'".format(ZefaniaXMLBible.verseTag, element.tag))
    # end of ZefaniaXMLBible.__validateAndExtractChapter

    def __getStyledText(self, styleElement, styleLocation, checkCode, attributeContext):
        """
        Convert a STYLE element into USFM character formatting and return the resulting text
            (including the stripped tail text that follows the element, if any).

        styleLocation and checkCode are passed on to Globals.checkXMLNoSubelements.
        attributeContext names the kind of element in the warning about unprocessed attributes.

        Unrecognised styles are logged and wrapped in \\sc (prefixed with the style in brackets)
            so that no text is lost.
        """
        Globals.checkXMLNoSubelements(styleElement, styleLocation, checkCode)
        css = idStyle = None
        for attrib, value in styleElement.items():
            if attrib == "css":
                css = value
            elif attrib == "id":
                idStyle = value
            else:
                logging.warning("Unprocessed '{}' attribute ({}) in {}".format(attrib, value, attributeContext))
        if Globals.debugFlag:
            assert css or idStyle

        SFM = ZefaniaXMLBible.__cssToSFM.get(css)
        if SFM is None and css is None and idStyle == 'cl:divineName':
            SFM = '\\nd'

        sText = (styleElement.text or '').strip()  # An empty element has None for its text
        if Globals.debugFlag:
            assert sText
        if SFM:
            styledText = SFM + ' ' + sText + SFM + '*'
        else:  # Use sc for unknown styles
            logging.warning("Unknown style (css is {}, idStyle is {}) in {}".format(css, idStyle, styleLocation))
            styleName = css if css is not None else (idStyle or '')
            styledText = '\\sc ' + '[' + styleName + ']' + sText + '\\sc* '
        if styleElement.tail:
            styledText += styleElement.tail.strip()
        return styledText
    # end of ZefaniaXMLBible.__getStyledText

    def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
        """
        Check/validate and extract verse data from the given XML verse record
            finding and saving the verse text and
            handling note, style, and line break subelements.

        The text inside NOTE elements is currently not saved (only their tails and styled subelements).
        """
        if Globals.verbosityLevel > 3:
            print(_("Validating XML verse..."))

        location = "verse in {} {}".format(BBB, chapterNumber)
        Globals.checkXMLNoTail(verse, location, 'l5ks')

        # Handle verse attributes
        verseNumber = None
        for attrib, value in verse.items():
            if attrib == "vnumber":
                verseNumber = value
            else:
                logging.warning("Unprocessed '{}' attribute ({}) in verse element".format(attrib, value))
        if Globals.debugFlag:
            assert verseNumber
        location = "{}:{}".format(location, verseNumber)  # Get a better location description

        # The text is None if a verse starts immediately with a style or note
        #   so use an empty string to allow the styled text to be appended below
        vText = verse.text.strip() if verse.text else ''

        # Handle verse subelements (notes and styled portions)
        for subelement in verse:
            if subelement.tag == ZefaniaXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib, value in subelement.items():
                    if attrib == "type":
                        noteType = value
                    else:
                        logging.warning("Unprocessed '{}' attribute ({}) in note subelement".format(attrib, value))
                if noteType not in ('n-studynote', 'x-studynote',):
                    logging.warning("Unexpected {} note type in {}".format(noteType, BBB))
                if Globals.debugFlag:
                    assert noteType
                # NOTE: subelement.text (the note itself) is not saved -- losing data here (for now)
                nTail = subelement.tail
                if nTail:
                    if '\n' in nTail:
                        print("ZefaniaXMLBible.__validateAndExtractVerse: nTail {} {}:{} '{}'"
                              .format(BBB, chapterNumber, verseNumber, nTail))
                        nTail = nTail.replace('\n', ' ')
                    thisBook.appendLine('v~', nTail)
                for subsubelement in subelement:
                    if subsubelement.tag == ZefaniaXMLBible.styleTag:
                        vText += self.__getStyledText(subsubelement, "style in " + sublocation, 'fyt4',
                                                      "style subsubelement")
                    else:
                        logging.error("Expected to find {} but got '{}' in {}".format(
                            ZefaniaXMLBible.styleTag, subsubelement.tag, sublocation))
            elif subelement.tag == ZefaniaXMLBible.styleTag:
                vText += self.__getStyledText(subelement, "style in " + location, 'f5gh', "style subelement")
            elif subelement.tag == ZefaniaXMLBible.breakTag:
                sublocation = "line break in " + location
                Globals.checkXMLNoText(subelement, sublocation, 'c1d4')
                Globals.checkXMLNoSubelements(subelement, sublocation, 'g4g8')
                art = None
                for attrib, value in subelement.items():
                    if attrib == "art":
                        art = value
                    else:
                        logging.warning("Unprocessed '{}' attribute ({}) in line break subelement".format(attrib, value))
                if Globals.debugFlag:
                    assert art == 'x-nl'
                if vText:  # Save what we have so far before starting the new paragraph
                    thisBook.appendLine('v', verseNumber + ' ' + vText)
                    vText = ''
                thisBook.appendLine('m', subelement.tail.strip() if subelement.tail else '')
            else:
                logging.error("Expected to find NOTE or STYLE but got '{}' in {}".format(subelement.tag, location))

        if vText:  # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print("ZefaniaXMLBible.__validateAndExtractVerse: vText {} {}:{} '{}'"
                      .format(BBB, chapterNumber, verseNumber, vText))
                vText = vText.replace('\n', ' ')
            thisBook.appendLine('v', verseNumber + ' ' + vText)
    # end of ZefaniaXMLBible.__validateAndExtractVerse
# end of ZefaniaXMLBible class
class USXXMLBibleBook( BibleBook ):
    """
    Class to load, validate, and manipulate a single Bible book in USX XML.

    The XML is converted into USFM-like lines (marker plus text) using
        the appendLine and appendToLastLine methods of this object.
    """

    def __init__( self, name, BBB ):
        """
        Create the USX Bible book object.
        """
        BibleBook.__init__( self, name, BBB ) # Initialise the base class
        self.objectNameString = "USX XML Bible Book object"
        self.objectTypeString = "USX"
    # end of USXXMLBibleBook.__init__


    def load( self, filename, folder=None, encoding='utf-8' ):
        """
        Load a single source USX XML file and extract the information.

        filename is joined to folder (if one is given) to find the file.
        The encoding parameter is accepted but is not used by this method.

        Problems found in the data are logged and also collected
            into self.errorDictionary['Load Errors'].
        """

        def loadParagraph( paragraphXML, paragraphlocation ):
            """
            Load a paragraph from the USX XML.

            Writes the paragraph marker as a new line and then adds
                the verse, char, and note subelements to it.

            Uses (and updates) c,v information from the containing function.
            """
            nonlocal c, v

            # Process the attributes first
            paragraphStyle = None
            for attrib,value in paragraphXML.items():
                if attrib=='style':
                    paragraphStyle = value # This is basically the USFM marker name
                else: # NB: Only paragraphlocation is available for the message here (no subelement yet)
                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, paragraphlocation ) )

            # Now process the paragraph text (or write a paragraph marker anyway)
            self.appendLine( paragraphStyle, paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' )

            # Now process the paragraph subelements
            for element in paragraphXML:
                location = element.tag + ' ' + paragraphlocation
                if element.tag == 'verse': # milestone (not a container)
                    Globals.checkXMLNoText( element, location )
                    Globals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    verseStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            v = value
                        elif attrib=='style':
                            verseStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    if verseStyle != 'v':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) )
                    self.appendLine( verseStyle, v + ' ' )
                    # Now process the tail (if there's one) which is the verse text
                    if element.tail:
                        vText = element.tail.strip()
                        if vText:
                            self.appendToLastLine( vText )
                elif element.tag == 'char':
                    Globals.checkXMLNoSubelements( element, location )
                    # Process the attributes first
                    charStyle = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            charStyle = value # This is basically the USFM character marker name
                            assert( not Globals.USFMMarkers.isNewlineMarker( charStyle ) )
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    # A character field must be added to the previous field
                    tail = '' if element.tail is None else element.tail.strip()
                    # NB: The text is None for an empty element (which would otherwise be written as 'None')
                    additionalText = "\\{} {}\\{}*{}".format( charStyle, element.text or '', charStyle, tail )
                    if Globals.debugFlag:
                        print( "USX.loadParagraph:", c, v, paragraphStyle, charStyle, repr(additionalText) )
                    self.appendToLastLine( additionalText )
                elif element.tag == 'note':
                    Globals.checkXMLNoText( element, location )
                    # Process the attributes first
                    noteStyle = noteCaller = None
                    for attrib,value in element.items():
                        if attrib=='style':
                            noteStyle = value # This is basically the USFM marker name
                            assert( noteStyle in ('x','f',) )
                        elif attrib=='caller':
                            noteCaller = value # Usually hyphen or a symbol to be used for the note
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                    assert( noteStyle and noteCaller ) # both compulsory
                    noteLine = "\\{} {} ".format( noteStyle, noteCaller )
                    # Now process the subelements -- notes are one of the few multiply embedded fields in USX
                    for subelement in element:
                        sublocation = subelement.tag + ' ' + location
                        if subelement.tag == 'char': # milestone (not a container)
                            Globals.checkXMLNoTail( subelement, sublocation )
                            Globals.checkXMLNoSubelements( subelement, sublocation )
                            # Process the attributes first
                            charStyle, charClosed = None, True
                            for attrib,value in subelement.items():
                                if attrib=='style':
                                    charStyle = value
                                elif attrib=='closed':
                                    assert( value=='false' )
                                    charClosed = False
                                else:
                                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                            noteLine += "\\{} {}".format( charStyle, subelement.text or '' )
                            if charClosed: noteLine += "\\{}*".format( charStyle )
                        else:
                            logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.bookReferenceCode, c, v, sublocation ) )
                            self.addPriorityError( 1, c, v, _("Unprocessed {} subelement").format( subelement.tag ) )
                        if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail
                    noteLine += "\\{}*".format( noteStyle )
                    if element.tail:
                        noteLine += element.tail.strip()
                    self.appendToLastLine( noteLine )
                elif element.tag == 'unmatched': # Used to denote errors in the source text
                    Globals.checkXMLNoText( element, location )
                    Globals.checkXMLNoTail( element, location )
                    Globals.checkXMLNoAttributes( element, location )
                    Globals.checkXMLNoSubelements( element, location )
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.bookReferenceCode, c, v, location ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )
                    # Display the last few lines that we loaded to help find the problem
                    for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] )
                    if Globals.debugFlag: # Stop here in debug mode so that the new element type gets noticed
                        raise NotImplementedError( "Unprocessed {} element in {}".format( element.tag, location ) )
        # end of loadParagraph


        if Globals.verbosityLevel > 2: print( " " + _("Loading {}...").format( filename ) )
        self.isOneChapterBook = self.bookReferenceCode in Globals.BibleBooksCodes.getSingleChapterBooksList()
        self.sourceFilename = filename
        self.sourceFolder = folder
        self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
        self.tree = ElementTree().parse( self.sourceFilepath )
        assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        c = v = '0' # The current chapter and verse -- also updated by loadParagraph
        loadErrors = []

        # Find the main container
        if self.tree.tag=='usx' or self.tree.tag=='usfm': # Not sure why both are allowable
            location = "USX ({}) file".format( self.tree.tag )
            Globals.checkXMLNoText( self.tree, location )
            Globals.checkXMLNoTail( self.tree, location )

            # Process the attributes first
            self.schemaLocation = ''
            version = None
            for attrib,value in self.tree.items():
                if attrib=='version':
                    version = value
                else:
                    logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
            if version not in ( None, '2.0' ):
                logging.warning( _("Not sure if we can handle v{} USX files").format( version ) )

            # Now process the data
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'book': # milestone (not a container)
                    Globals.checkXMLNoSubelements( element, sublocation )
                    Globals.checkXMLNoTail( element, sublocation )
                    # Process the attributes
                    idField = bookStyle = None
                    for attrib,value in element.items():
                        if attrib=='id' or attrib=='code':
                            idField = value # Should be USFM bookcode (not like bookReferenceCode which is BibleOrgSys BBB bookcode)
                        elif attrib=='style':
                            bookStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if bookStyle != 'id':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) )
                    idLine = idField
                    if element.text and element.text.strip(): idLine += ' ' + element.text
                    self.appendLine( 'id', idLine )
                elif element.tag == 'chapter': # milestone (not a container)
                    v = '0'
                    Globals.checkXMLNoText( element, sublocation )
                    Globals.checkXMLNoTail( element, sublocation )
                    Globals.checkXMLNoSubelements( element, sublocation )
                    # Process the attributes
                    chapterStyle = None
                    for attrib,value in element.items():
                        if attrib=='number':
                            c = value
                        elif attrib=='style':
                            chapterStyle = value
                        else:
                            logging.warning( _("Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
                    if chapterStyle != 'c':
                        logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) )
                    self.appendLine( 'c', c )
                elif element.tag == 'para':
                    Globals.checkXMLNoTail( element, sublocation )
                    USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style
                    if Globals.USFMMarkers.isNewlineMarker( USFMMarker ):
                        loadParagraph( element, sublocation )
                    elif Globals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker -- append it to the previous line
                        text = element.text
                        if text is None: text = ''
                        if Globals.debugFlag:
                            print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, USFMMarker, text ) )
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, USFMMarker, text ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.bookReferenceCode, c, v, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.bookReferenceCode, c, v, USFMMarker ) )
                            logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.bookReferenceCode, c, v ) )
                        self.addPriorityError( 97, c, v, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) )
                        # Add the field to the previous line (if there is one) so that the text isn't lost
                        #   (No space is inserted first -- it's not always good to add one)
                        if len( self ): self.appendToLastLine( '\\' + USFMMarker + ' ' + text )
                    else: # the line begins with an unknown USFM Marker
                        text = element.text
                        if text:
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, USFMMarker, text ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.bookReferenceCode, c, v, text ) )
                        else: # no text
                            loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.bookReferenceCode, c, v, USFMMarker ) )
                            logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.bookReferenceCode, c, v ) )
                        self.addPriorityError( 100, c, v, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) )
                        # NOTE(review): sortedNLMarkers is not defined in this class -- presumably a module-level list
                        #   of the newline markers with the longest ones first -- confirm
                        for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                            if USFMMarker.startswith( tryMarker ): # Let's try changing it
                                # Save a line with the corrected marker, with the rest of the bad marker being part of the text
                                self.appendLine( tryMarker, USFMMarker[len(tryMarker):] + ' ' + (text or '') )
                                loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to '{}' at beginning of line: {}").format( self.bookReferenceCode, c, v, USFMMarker, tryMarker, text ) )
                                logging.warning( _("Changed '\\{}' unknown USFM Marker to '{}' after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.bookReferenceCode, c, v, text ) )
                                break
                        # Otherwise, don't bother processing this line -- it'll just cause more problems later on
                else:
                    logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.bookReferenceCode, c, v, sublocation ) )
                    self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) )

        if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
    # end of USXXMLBibleBook.load
# end of USXXMLBibleBook class
class HaggaiXMLBible( Bible ):
    """
    Class for reading, validating, and converting HaggaiXMLBible XML.

    The expected layout is an XMLBIBLE container holding an optional INFORMATION
        header (first or last child) and BIBLEBOOK elements, which in turn hold
        CAPTION and CHAPTER elements. Chapters hold PARAGRAPH elements containing
        VERSE elements, and verses may contain NOTE, STYLE and BR subelements.
    """
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'XMLBIBLE'
    infoTag = 'INFORMATION'
    bookTag = 'BIBLEBOOK'
    chapterTag = 'CHAPTER'
    captionTag = 'CAPTION'
    paragraphTag = 'PARAGRAPH'
    verseTag = 'VERSE'
    noteTag = 'NOTE'
    styleTag = 'STYLE'
    breakTag = 'BR'

    # Maps the 'fs' attribute of a STYLE element to the USFM character marker used for it
    _styleMarkers = { 'italic':'\\it', 'super':'\\bdit', 'emphasis':'\\em' }


    def __init__( self, sourceFolder, givenName, encoding='utf-8' ):
        """
        Constructor: just sets up the Haggai Bible object.

        sourceFolder and givenName are joined to give the path of the XML file.
        encoding is only stored on the object here.
        An unreadable file is reported on stdout but does not raise an exception.
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = 'Haggai XML Bible object'
        self.objectTypeString = 'Haggai'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join( self.sourceFolder, self.givenName )

        self.tree = self.header = None # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' )

        # Do a preliminary check on the readability of our file
        if not os.access( self.sourceFilepath, os.R_OK ):
            print( "HaggaiXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) )

        self.name = self.givenName
    # end of HaggaiXMLBible.__init__


    def load( self ):
        """
        Load a single source XML file and load book elements.

        A file that cannot be parsed is logged as critical and nothing is loaded.
        Otherwise the main element's attributes, the optional header and each book
            are processed, and then doPostLoadProcessing() is called.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) )
        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( exp("Loader parse error in xml file {}: {} {}").format( self.givenName, sys.exc_info()[0], err ) )
            return # self.tree is still None so there is nothing that we can process
        if BibleOrgSysGlobals.debugFlag: assert len( self.tree ) # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == HaggaiXMLBible.treeTag:
            location = "Haggai XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            # Collect the attributes that we recognise (not all of them are used yet)
            schemaAttribute = HaggaiXMLBible.XMLNameSpace + 'noNamespaceSchemaLocation'
            recognisedAttributes = ( schemaAttribute, 'biblename', 'lgid', 'status', 'type', 'revision', 'version' )
            mainAttributes = {}
            for attrib,value in self.tree.items():
                if attrib in recognisedAttributes: mainAttributes[attrib] = value
                else: logging.warning( "Unprocessed {!r} attribute ({}) in main element".format( attrib, value ) )
            if mainAttributes.get( 'biblename' ): self.name = mainAttributes['biblename']
            if mainAttributes.get( 'status' ): self.status = mainAttributes['status']
            if mainAttributes.get( 'revision' ): self.revision = mainAttributes['revision']
            if mainAttributes.get( 'version' ): self.version = mainAttributes['version']

            # The information record is normally first, but can also be at the END of the file
            if len( self.tree ): # An empty container has no children to index
                for candidate in ( self.tree[0], self.tree[-1] ):
                    if candidate.tag == HaggaiXMLBible.infoTag:
                        self.header = candidate
                        self.tree.remove( self.header )
                        self.__validateAndExtractHeader()
                        break

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == HaggaiXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                    self.__validateAndExtractBook( element )
                else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.bookTag, element.tag ) )
        else: logging.error( "Expected to load {!r} but got {!r}".format( HaggaiXMLBible.treeTag, self.tree.tag ) )
        self.doPostLoadProcessing()
    # end of HaggaiXMLBible.load


    def __validateAndExtractHeader( self ):
        """
        Extracts information out of the header record, such as:
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started…</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Haggai XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
            </INFORMATION>

        Each recognised field is stored as an attribute on this object.
        Multiple contributor elements are gathered into one flat list.
        """
        # NOTE: Must compare with None because an element without children tests as false
        if BibleOrgSysGlobals.debugFlag: assert self.header is not None
        location = 'Header'
        BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' )
        BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' )
        BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' )

        # TODO: We probably need to rationalise some of the self.xxx stores
        # Both tables map the XML tag to the name of the attribute it is stored in
        requiredFields = { 'title':'title', 'description':'description', 'date':'date',
                            'identifier':'identifier', 'source':'source', 'language':'language' }
        optionalFields = { 'creator':'creator', 'subject':'subject', 'publisher':'publisher',
                            'contributors':'contributors', 'type':'documentType',
                            'coverage':'coverage', 'rights':'rights' }
        specialFields = ( 'contributor', 'format' )

        for element in self.header:
            tag = element.tag
            if tag not in requiredFields and tag not in optionalFields and tag not in specialFields:
                logging.error( "Found unexpected {!r} tag in {}".format( tag, location ) )
                continue

            sublocation = "{} in {}".format( tag, location )
            tailCode, attributesCode, subelementsCode = ('alj1d','j3jjd','5gk78') if tag=='contributor' else ('al1d','j3jd','5g78')
            BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, tailCode )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, attributesCode )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, subelementsCode )

            if tag in requiredFields: # Always stored (even if empty) but text is expected
                if BibleOrgSysGlobals.debugFlag: assert element.text
                setattr( self, requiredFields[tag], element.text )
            elif tag in optionalFields: # Only stored if there's some text
                if element.text: setattr( self, optionalFields[tag], element.text )
            elif tag == 'format': # Checked but not stored
                if BibleOrgSysGlobals.debugFlag: assert element.text == 'Haggai XML Bible Markup Language'
            elif element.text: # contributor -- there might be several of these
                previous = getattr( self, 'contributor', None )
                if previous is None: self.contributor = element.text # Must be the first (and possibly only) one
                elif isinstance( previous, list ): previous.append( element.text ) # Keep the list flat
                else: self.contributor = [ previous, element.text ] # Put multiples into a list
    # end of HaggaiXMLBible.__validateAndExtractHeader


    def __validateAndExtractBook( self, book ):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book is identified from its bnumber attribute, or else from its bname.
        A book that can't be identified is skipped (nothing is stashed for it).
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") )

        # Process the div attributes first
        BBB = bookName = bookShortName = bookNumber = None
        for attrib,value in book.items():
            if attrib == 'bnumber': bookNumber = value
            elif attrib == 'bname': bookName = value
            elif attrib == 'bsname': bookShortName = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )

        if bookNumber:
            try: BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
            except KeyError:
                logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it"
                                    .format( bookNumber, bookName, bookShortName ) )
        elif bookName:
            BBB = self.genericBOS.getBBBFromText( bookName )
        if not BBB: return # We don't know which book this is

        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}…").format( BBB, bookName ) )
        thisBook = BibleBook( self, BBB )
        thisBook.objectNameString = 'Haggai XML Bible Book object'
        thisBook.objectTypeString = 'Haggai'
        for element in book:
            if element.tag == HaggaiXMLBible.captionTag:
                sublocation = "caption in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'jhl6' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'jk21' )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'kjh6' )
                thisBook.addLine( 'mt', element.text )
            elif element.tag == HaggaiXMLBible.chapterTag:
                sublocation = "chapter in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                self.__validateAndExtractChapter( BBB, thisBook, element )
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, element.tag ) )
        if BibleOrgSysGlobals.verbosityLevel > 2: print( " Saving {} into results…".format( BBB ) )
        self.stashBook( thisBook )
    # end of HaggaiXMLBible.__validateAndExtractBook


    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving paragraph elements.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter…") )

        # Process the chapter attributes first
        chapterNumber = None
        for attrib,value in chapter.items():
            if attrib == 'cnumber': chapterNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber: thisBook.addLine( 'c', chapterNumber )
        else: logging.error( "Missing 'cnumber' attribute in chapter element for {}".format( BBB ) )

        # NOTE(review): The 'disabled' suffixes mean that verses and captions directly inside a chapter
        #   never match -- this looks deliberate so it has been left unchanged
        for element in chapter:
            if element.tag == HaggaiXMLBible.paragraphTag:
                self.__validateAndExtractParagraph( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.verseTag+'disabled':
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
                self.__validateAndExtractCaption( BBB, chapterNumber, thisBook, element )
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.paragraphTag, element.tag ) )
    # end of HaggaiXMLBible.__validateAndExtractChapter


    def __validateAndExtractParagraph( self, BBB, chapterNumber, thisBook, paragraph ):
        """
        Check/validate and extract paragraph data from the given XML book record
            finding and saving paragraphs and
            finding and saving verse elements.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML paragraph…") )

        location = "paragraph in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoAttributes( paragraph, location, 'brgw3' )
        BibleOrgSysGlobals.checkXMLNoText( paragraph, location, 'brgw3' )
        BibleOrgSysGlobals.checkXMLNoTail( paragraph, location, 'brgw3' )
        thisBook.addLine( 'p', '' )

        # Handle verse subelements (verses)
        for element in paragraph:
            if element.tag == HaggaiXMLBible.verseTag:
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms (see NOTE in chapter code)
                self.__validateAndExtractCaption( BBB, chapterNumber, thisBook, element )
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, element.tag ) )
    # end of HaggaiXMLBible.__validateAndExtractParagraph


    def __validateAndExtractCaption( self, BBB, chapterNumber, thisBook, caption ):
        """
        Check/validate a caption element found inside a chapter or paragraph
            and save its text (if any) as verse zero.
        """
        location = "caption in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( caption, location, 'k5k8' )
        BibleOrgSysGlobals.checkXMLNoSubelements( caption, location, 'd3f5' )

        # Handle caption attributes
        vRef = None
        for attrib,value in caption.items():
            if attrib == 'vref':
                vRef = value
                if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
            else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert vRef

        vText = caption.text
        if vText: thisBook.addLine( 'v', '0' + ' ' + vText ) # We save it as verse zero
        else: logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
    # end of HaggaiXMLBible.__validateAndExtractCaption


    def __styledText( self, styleElement, styleLocation, errorCode, description ):
        """
        Check/validate a STYLE element and return its text wrapped in the matching
            USFM character marker, followed by any (stripped) tail text.

        errorCode is passed through to the no-subelements check.
        description ('subelement' or 'subsubelement') is only used in log messages.
        """
        BibleOrgSysGlobals.checkXMLNoSubelements( styleElement, styleLocation, errorCode )
        fs = None
        for attrib,value in styleElement.items():
            if attrib == 'fs': fs = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in style {}".format( attrib, value, description ) )
        if BibleOrgSysGlobals.debugFlag: assert fs

        SFM = HaggaiXMLBible._styleMarkers.get( fs )
        if SFM is None:
            logging.error( "Unknown {!r} style in {}".format( fs, styleLocation ) )
            # Presumably 'halt' is deliberately undefined (as elsewhere in this file) so that the NameError stops a debug run
            if BibleOrgSysGlobals.debugFlag: halt

        sText = ( styleElement.text or '' ).strip() # An empty element has no text at all
        if BibleOrgSysGlobals.debugFlag: assert sText
        if SFM: result = SFM+' ' + sText + SFM+'*'
        else: result = '\\sc ' + '['+str(fs)+']' + sText + '\\sc* ' # Use sc for unknown styles
        if styleElement.tail: result += styleElement.tail.strip()
        return result
    # end of HaggaiXMLBible.__styledText


    def __addVerseText( self, thisBook, verseNumber, vText ):
        """
        Save vText as a verse line, or as a continuation (m) line if there's no
            verse number -- either because the verse line has already been saved
            before a line break, or because the vnumber attribute was missing.
        """
        if verseNumber is None: thisBook.addLine( 'm', vText )
        else: thisBook.addLine( 'v', verseNumber + ' ' + vText )
    # end of HaggaiXMLBible.__addVerseText


    def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
        """
        Check/validate and extract verse data from the given XML book record
            finding and saving verse elements.

        Notes become USFM footnotes, styled portions become USFM character fields,
            and a line break saves the text so far then starts a continuation (m) line.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

        location = "verse in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

        # Handle verse attributes
        verseNumber = None
        for attrib,value in verse.items():
            if attrib == 'vnumber': verseNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert verseNumber
        if verseNumber is None: logging.error( "Missing 'vnumber' attribute in {}".format( location ) )
        location = "{}:{}".format( location, verseNumber ) # Get a better location description
        # NOTE: There might be no text if a verse starts immediately with a style or note
        vText = '' if verse.text is None else verse.text.strip()

        # Handle verse subelements (notes and styled portions)
        for subelement in verse:
            if subelement.tag == HaggaiXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib,value in subelement.items():
                    if attrib == 'type': noteType = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in note subelement".format( attrib, value ) )
                if noteType and noteType not in ('variant',):
                    logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
                nText = subelement.text or '' # Don't put 'None' into the footnote if there's no leading text
                nTail = subelement.tail
                if noteType: vText += "\\f + \\fk {} \\ft {}\\f*".format( noteType, nText )
                else: vText += "\\f + \\ft {}\\f*".format( nText )
                if nTail:
                    if '\n' in nTail:
                        print( "HaggaiXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                        nTail = nTail.replace( '\n', ' ' )
                    vText += nTail
                # NOTE(review): Styled portions of a note get appended AFTER the footnote is closed -- order unchanged here
                for subsubelement in subelement:
                    if subsubelement.tag == HaggaiXMLBible.styleTag:
                        vText += self.__styledText( subsubelement, "style in " + sublocation, 'fyt4', 'subsubelement' )
                    else: logging.error( "Expected to find {} but got {!r} in {}".format( HaggaiXMLBible.styleTag, subsubelement.tag, sublocation ) )

            elif subelement.tag == HaggaiXMLBible.styleTag:
                vText += self.__styledText( subelement, "style in " + location, 'f5gh', 'subelement' )

            elif subelement.tag == HaggaiXMLBible.breakTag:
                sublocation = "line break in " + location
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
                art = None
                for attrib,value in subelement.items():
                    if attrib == 'art': art = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in line break subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
                if vText: # Save what we have so far before starting the new line
                    self.__addVerseText( thisBook, verseNumber, vText )
                    verseNumber = None # So that we don't save the verse number twice
                    vText = ''
                thisBook.addLine( 'm', subelement.tail.strip() if subelement.tail else '' )

            else: logging.error( "Expected to find NOTE or STYLE but got {!r} in {}".format( subelement.tag, location ) )

        if vText: # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print( "HaggaiXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                vText = vText.replace( '\n', ' ' )
            self.__addVerseText( thisBook, verseNumber, vText )
    # end of HaggaiXMLBible.__validateAndExtractVerse
# end of HaggaiXMLBible class
class USXXMLBibleBook( BibleBook ): """ Class to load, validate, and manipulate a single Bible book in USX XML. """ def __init__( self, name, BBB ): """ Create the USX Bible book object. """ BibleBook.__init__( self, name, BBB ) # Initialise the base class self.objectNameString = 'USX XML Bible Book object' self.objectTypeString = 'USX' global sortedNLMarkers if sortedNLMarkers is None: sortedNLMarkers = sorted( BibleOrgSysGlobals.USFMMarkers.getNewlineMarkersList('Combined'), key=len, reverse=True ) #self.BBB = BBB # end of USXXMLBibleBook.__init__ def load( self, filename, folder=None, encoding='utf-8' ): """ Load a single source USX XML file and extract the information. """ if BibleOrgSysGlobals.debugFlag and debuggingThisModule: print( exp("load( {}, {}, {} )").format( filename, folder, encoding ) ) def loadParagraph( paragraphXML, paragraphlocation ): """ Load a paragraph from the USX XML. In this context, paragraph means heading and intro lines, as well as paragraphs of verses. Uses (and updates) C,V information from the containing function. 
""" nonlocal C, V # Process the attributes first paragraphStyle = None for attrib,value in paragraphXML.items(): if attrib=='style': paragraphStyle = value # This is basically the USFM marker name else: logging.warning( _("CH46 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) # Now process the paragraph text (or write a paragraph marker anyway) paragraphText = paragraphXML.text if paragraphXML.text and paragraphXML.text.strip() else '' if version is None: paragraphText = paragraphText.rstrip() # Don't need to strip extra spaces in v2 self.addLine( paragraphStyle, paragraphText ) # Now process the paragraph subelements for element in paragraphXML: location = element.tag + ' ' + paragraphlocation #print( "USXXMLBibleBook.load", C, V, element.tag, location ) if element.tag == 'verse': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) # Process the attributes first verseStyle = altNumber = None for attrib,value in element.items(): if attrib=='number': V = value elif attrib=='style': verseStyle = value elif attrib=='altnumber': altNumber = value else: logging.error( _("KR60 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if verseStyle != 'v': logging.error( _("Unexpected style attribute ({}) in {}").format( verseStyle, location ) ) #if altNumber: print( repr(verseStyle), repr(altNumber) ); halt altStuff = ' \\va {}\\va*'.format( altNumber ) if altNumber else '' self.addLine( verseStyle, V + altStuff + ' ' ) # Now process the tail (if there's one) which is the verse text if element.tail: vText = element.tail if vText[0]=='\n': vText = vText.lstrip() # Paratext puts cross references on a new line if vText: #print( repr(vText) ) self.appendToLastLine( vText ) elif element.tag == 'char': # Process the attributes first charStyle = None for attrib,value in element.items(): if attrib=='style': charStyle = value # This is basically 
the USFM character marker name #print( " charStyle", charStyle ) assert not BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( charStyle ) else: logging.error( _("QU52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) charLine = "\\{} {} ".format( charStyle, element.text ) # Now process the subelements -- chars are one of the few multiply embedded fields in USX for subelement in element: sublocation = subelement.tag + ' ' + location #print( '{} {}:{} {}'.format( self.BBB, C, V, element.tag ) ) if subelement.tag == 'char': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first subCharStyle, charClosed = None, True for attrib,value in subelement.items(): if attrib=='style': subCharStyle = value elif attrib=='closed': assert value=='false' charClosed = False else: logging.error( _("KS41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) charLine += "\\{} {}".format( subCharStyle, subelement.text ) if charClosed: charLine += "\\{}*".format( subCharStyle ) #if subelement.tail is not None: print( " tail1", repr(subelement.tail) ) charLine += '' if subelement.tail is None else subelement.tail else: logging.error( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) ) # A character field must be added to the previous field #if element.tail is not None: print( " tail2", repr(element.tail) ) charTail = '' if element.tail: charTail = element.tail if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts footnote parts on new lines charLine += "\\{}*{}".format( charStyle, charTail ) #if debuggingThisModule: print( "USX.loadParagraph:", C, V, paragraphStyle, charStyle, repr(charLine) ) self.appendToLastLine( charLine ) elif element.tag == 'note': #print( "NOTE", BibleOrgSysGlobals.elementStr( element ) ) # 
Process the attributes first noteStyle = noteCaller = None for attrib,value in element.items(): if attrib=='style': noteStyle = value # This is basically the USFM marker name assert noteStyle in ('x','f',) elif attrib=='caller': noteCaller = value # Usually hyphen or a symbol to be used for the note else: logging.error( _("CY38 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if noteCaller=='' and self.BBB=='NUM' and C=='10' and V=='36': noteCaller = '+' # Hack assert noteStyle and noteCaller # both compulsory noteLine = "\\{} {} ".format( noteStyle, noteCaller ) if element.text: noteText = element.text.strip() noteLine += noteText # Now process the subelements -- notes are one of the few multiply embedded fields in USX for subelement in element: sublocation = subelement.tag + ' ' + location #print( C, V, subelement.tag ) if subelement.tag == 'char': # milestone (not a container) # Process the attributes first charStyle, charClosed = None, True for attrib,value in subelement.items(): if attrib=='style': charStyle = value elif attrib=='closed': assert value=='false' charClosed = False else: logging.warning( _("GJ67 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) noteLine += "\\{} {}".format( charStyle, subelement.text ) # Now process the subelements -- notes are one of the few multiply embedded fields in USX for sub2element in subelement: sub2location = sub2element.tag + ' ' + sublocation #print( C, V, sub2element.tag ) if sub2element.tag == 'char': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location ) # Process the attributes first char2Style, char2Closed = None, True for attrib,value in sub2element.items(): if attrib=='style': char2Style = value elif attrib=='closed': assert value=='false' char2Closed = False else: logging.warning( _("VH36 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) assert char2Closed noteLine += "\\{} 
{}\\{}*{}".format( char2Style, sub2element.text, char2Style, sub2element.tail if sub2element.tail else '' ) if charClosed: noteLine += "\\{}*".format( charStyle ) if subelement.tail: charTail = subelement.tail if charTail[0]=='\n': charTail = charTail.lstrip() # Paratext puts cross reference parts on a new line noteLine += charTail elif subelement.tag == 'unmatched': # Used to denote errors in the source text BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation ) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation ) # Process the attributes first unmmatchedMarker = None for attrib,value in subelement.items(): if attrib=='marker': unmmatchedMarker = value else: logging.warning( _("NV21 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) self.addPriorityError( 2, C, V, _("Unmatched subelement for {} in {}").format( repr(unmmatchedMarker), sublocation) if unmmatchedMarker else _("Unmatched subelement in {}").format( sublocation) ) else: logging.warning( _("Unprocessed {} subelement after {} {}:{} in {}").format( subelement.tag, self.BBB, C, V, sublocation ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} subelement").format( subelement.tag ) ) if subelement.tail and subelement.tail.strip(): noteLine += subelement.tail #noteLine += "\\{}*".format( charStyle ) noteLine += "\\{}*".format( noteStyle ) if element.tail: #if '\n' in element.tail: halt noteTail = element.tail if noteTail[0]=='\n': noteTail = noteTail.lstrip() # Paratext puts multiple cross-references on new lines noteLine += noteTail #print( "NoteLine", repr(noteLine) ) self.appendToLastLine( noteLine ) elif element.tag == 'link': # Used to include extra resources BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoTail( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) # Process the attributes first linkStyle = linkDisplay = linkTarget = None for attrib,value in element.items(): if 
attrib=='style': linkStyle = value assert linkStyle in ('jmp',) elif attrib=='display': linkDisplay = value # e.g., "click here" elif attrib=='target': linkTarget = value # e.g., some reference else: logging.warning( _("KW54 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.addPriorityError( 3, C, V, _("Unprocessed {} link to {} in {}").format( repr(linkDisplay), repr(linkTarget), location) ) elif element.tag == 'unmatched': # Used to denote errors in the source text BibleOrgSysGlobals.checkXMLNoText( element, location ) BibleOrgSysGlobals.checkXMLNoTail( element, location ) BibleOrgSysGlobals.checkXMLNoAttributes( element, location ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location ) self.addPriorityError( 2, C, V, _("Unmatched element in {}").format( location) ) else: logging.warning( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, location ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) ) for x in range(max(0,len(self)-10),len(self)): print( x, self._rawLines[x] ) if BibleOrgSysGlobals.debugFlag: halt # end of loadParagraph C = V = '0' loadErrors = [] lastMarker = None if BibleOrgSysGlobals.verbosityLevel > 3: print( " " + _("Loading {} from {}…").format( filename, folder ) ) elif BibleOrgSysGlobals.verbosityLevel > 2: print( " " + _("Loading {}…").format( filename ) ) self.isOneChapterBook = self.BBB in BibleOrgSysGlobals.BibleBooksCodes.getSingleChapterBooksList() self.sourceFilename = filename self.sourceFolder = folder self.sourceFilepath = os.path.join( folder, filename ) if folder else filename try: self.tree = ElementTree().parse( self.sourceFilepath ) except ParseError as err: logging.critical( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) ) loadErrors.append( exp("Loader parse error in xml file {}: {} {}").format( filename, sys.exc_info()[0], err ) ) self.addPriorityError( 100, C, V, _("Loader 
parse error in xml file {}: {}").format( filename, err ) ) if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all # Find the main container if 'tree' in dir(self) \ and ( self.tree.tag=='usx' or self.tree.tag=='usfm' ): # Not sure why both are allowable location = "USX ({}) file".format( self.tree.tag ) BibleOrgSysGlobals.checkXMLNoText( self.tree, location ) BibleOrgSysGlobals.checkXMLNoTail( self.tree, location ) # Process the attributes first self.schemaLocation = '' version = None for attrib,value in self.tree.items(): if attrib=='version': version = value else: logging.warning( _("DG84 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if version not in ( None, '2.0' ): logging.warning( _("Not sure if we can handle v{} USX files").format( version ) ) # Now process the data for element in self.tree: sublocation = element.tag + " " + location if element.tag == 'book': # milestone (not a container) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation ) # Process the attributes idField = bookStyle = None for attrib,value in element.items(): if attrib=='id' or attrib=='code': idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode) #if idField != BBB: # logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) ) elif attrib=='style': bookStyle = value else: logging.warning( _("MD12 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if bookStyle != 'id': logging.warning( _("Unexpected style attribute ({}) in {}").format( bookStyle, sublocation ) ) idLine = idField if element.text and element.text.strip(): idLine += ' ' + element.text self.addLine( 'id', idLine ) elif element.tag == 'chapter': # milestone (not a container) V = '0' BibleOrgSysGlobals.checkXMLNoText( element, sublocation ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation ) 
BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation ) # Process the attributes chapterStyle = pubNumber = None for attrib,value in element.items(): if attrib=='number': C = value elif attrib=='style': chapterStyle = value elif attrib=='pubnumber': pubNumber = value else: logging.error( _("LY76 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if chapterStyle != 'c': logging.warning( _("Unexpected style attribute ({}) in {}").format( chapterStyle, sublocation ) ) #if pubNumber: print( self.BBB, C, repr(pubNumber) ); halt self.addLine( 'c', C ) if pubNumber: self.addLine( 'cp', pubNumber ) elif element.tag == 'para': BibleOrgSysGlobals.checkXMLNoTail( element, sublocation ) USFMMarker = element.attrib['style'] # Get the USFM code for the paragraph style if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker( USFMMarker ): #if lastMarker: self.addLine( lastMarker, lastText ) #lastMarker, lastText = USFMMarker, text loadParagraph( element, sublocation ) elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker( USFMMarker ): # the line begins with an internal USFM Marker -- append it to the previous line text = element.text if text is None: text = '' if BibleOrgSysGlobals.debugFlag: print( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) ) #halt # Not checked yet if text: loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) ) logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) ) else: # no text loadErrors.append( _("{} {}:{} Found '\\{}' internal USFM Marker at beginning of line (with no text)").format( self.BBB, C, V, USFMMarker ) ) logging.warning( _("Found '\\{}' internal USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) ) 
self.addPriorityError( 97, C, V, _("Found \\{} internal USFM Marker on new line in file").format( USFMMarker ) ) #lastText += '' if lastText.endswith(' ') else ' ' # Not always good to add a space, but it's their fault! lastText = '\\' + USFMMarker + ' ' + text #print( "{} {} {} Now have {}:{!r}".format( self.BBB, C, V, lastMarker, lastText ) ) else: # the line begins with an unknown USFM Marker try: status = element.attrib['status'] except KeyError: status = None text = element.text if text: loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line with text: {}").format( self.BBB, C, V, USFMMarker, text ) ) logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line with text: {}").format( USFMMarker, self.BBB, C, V, text ) ) else: # no text loadErrors.append( _("{} {}:{} Found '\\{}' unknown USFM Marker at beginning of line (with no text").format( self.BBB, C, V, USFMMarker ) ) logging.error( _("Found '\\{}' unknown USFM Marker after {} {}:{} at beginning of line (with no text)").format( USFMMarker, self.BBB, C, V ) ) self.addPriorityError( 100, C, V, _("Found \\{} unknown USFM Marker on new line in file").format( USFMMarker ) ) if status == 'unknown': # USX exporter already knew it was a bad marker pass # Just drop it completely else: for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space if USFMMarker.startswith( tryMarker ): # Let's try changing it if lastMarker: self.addLine( lastMarker, lastText ) lastMarker, lastText = tryMarker, USFMMarker[len(tryMarker):] + ' ' + text loadErrors.append( _("{} {}:{} Changed '\\{}' unknown USFM Marker to {!r} at beginning of line: {}").format( self.BBB, C, V, USFMMarker, tryMarker, text ) ) logging.warning( _("Changed '\\{}' unknown USFM Marker to {!r} after {} {}:{} at beginning of line: {}").format( USFMMarker, tryMarker, self.BBB, C, V, text ) ) break # Otherwise, don't bother processing this line -- it'll just 
cause more problems later on else: logging.error( _("Unprocessed {} element after {} {}:{} in {}").format( element.tag, self.BBB, C, V, sublocation ) ) self.addPriorityError( 1, C, V, _("Unprocessed {} element").format( element.tag ) ) if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
class OpenSongXMLBible(Bible):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The expected layout is a <bible> root holding <b> (book) elements,
        which hold <c> (chapter) elements, which hold <v> (verse) elements.
    Each book is converted into USFM-like lines on a BibleBook object
        and then stashed on this Bible.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the XML Bible file converter object.

        sourceFolder and givenName are joined to make the filepath of the XML file.
        encoding is only remembered here -- this method doesn't open the file.
        """
        # Setup and initialise the base class first
        if BibleOrgSysGlobals.debugFlag:
            print("OpenSongXMLBible( {}, {}, {} )".format(
                sourceFolder, givenName, encoding))
        Bible.__init__(self)
        self.objectNameString = 'OpenSong XML Bible object'
        self.objectTypeString = 'OpenSong'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.tree = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')

        # Do a preliminary check on the readability of our file
        #   (only reported -- the failure itself will surface in load())
        if not os.access(self.sourceFilepath, os.R_OK):
            print("OpenSongXMLBible: File {!r} is unreadable".format(
                self.sourceFilepath))

        self.name = self.givenName
    # end of OpenSongXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        Parses self.sourceFilepath, checks that the root is a <bible> element,
            hands each <b> child to __validateAndExtractBook,
            and finally calls doPostLoadProcessing().
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading {}…").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if BibleOrgSysGlobals.debugFlag:
            assert len(self.tree)  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == OpenSongXMLBible.treeTag:
            location = "XML file"
            BibleOrgSysGlobals.checkXMLNoText(self.tree, location, '4f6h')
            BibleOrgSysGlobals.checkXMLNoTail(self.tree, location, '1wk8')

            # The name ("n") and short name ("sn") attributes are recognised
            #   but not stored -- anything else is reported
            for attrib, value in self.tree.items():
                if attrib not in ("n", "sn"):
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in main element".
                        format(attrib, value))

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == OpenSongXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'g3g5')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'd3f6')
                    self.__validateAndExtractBook(element)
                elif element.tag in ('OT', 'NT'):
                    pass  # Testament elements are accepted silently but not processed
                else:
                    logging.error("Expected to find {!r} but got {!r}".format(
                        OpenSongXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load {!r} but got {!r}".format(
                OpenSongXMLBible.treeTag, self.tree.tag))
        self.doPostLoadProcessing()
    # end of OpenSongXMLBible.load

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book name comes from the "n" attribute. It is looked up first in the
            generic (English) organisational system and then in the lazily-loaded
            module-level BibleBooksNames table.
        Books whose name can't be resolved are logged and skipped.
        """
        global BibleBooksNames

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating OpenSong XML book…"))

        # Process the div attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in book element".format(
                        attrib, value))
        if not bookName:
            logging.error(
                _("OpenSong load can't find a book name"))  # no bookName
            return

        BBB = self.genericBOS.getBBBFromText(
            bookName)  # Booknames are usually in English
        if not BBB:  # wasn't English
            if BibleBooksNames is None:
                BibleBooksNames = BibleBooksNamesSystems().loadData()
            BBB = BibleBooksNames.getBBBFromText(
                bookName)  # Try non-English booknames
        if not BBB:
            logging.error(
                _("OpenSong load doesn't recognize book name: {!r}").
                format(bookName))  # no BBB
            return

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Validating {} {}…").format(BBB, bookName))
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'OpenSong XML Bible Book object'
        thisBook.objectTypeString = 'OpenSong'
        USFMAbbreviation = BibleOrgSysGlobals.BibleBooksCodes.getUSFMAbbreviation(
            BBB)
        thisBook.addLine(
            'id', '{} imported by {}'.format(USFMAbbreviation.upper(),
                                             ProgNameVersion))
        thisBook.addLine('h', bookName)
        thisBook.addLine('mt1', bookName)
        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                BibleOrgSysGlobals.checkXMLNoText(
                    element, sublocation, 'j3jd')
                BibleOrgSysGlobals.checkXMLNoTail(
                    element, sublocation, 'al1d')
                self.__validateAndExtractChapter(
                    BBB, thisBook, element)
            else:
                logging.error(
                    "Expected to find {!r} but got {!r}".format(
                        OpenSongXMLBible.chapterTag, element.tag))
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(" Saving {} into results…".format(BBB))
        self.stashBook(thisBook)
    # end of OpenSongXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (only used in messages here),
            thisBook is the BibleBook object that the lines are added to,
            and chapter is the XML chapter element.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter…"))

        # Process the div attributes first
        chapterNumber = numVerses = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                numVerses = value  # Recognised but not currently used
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace(
                'of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.addLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error("Expected to find {!r} but got {!r}".format(
                    OpenSongXMLBible.verseTag, element.tag))
                continue

            sublocation = "verse in {} {}".format(BBB, chapterNumber)
            BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'l5ks')
            verseNumber = toVerseNumber = None
            for attrib, value in element.items():
                if attrib == "n":
                    verseNumber = value
                elif attrib == "t":
                    toVerseNumber = value  # Recognised but not currently used
                else:
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in verse element".
                        format(attrib, value))
            if BibleOrgSysGlobals.debugFlag:
                assert verseNumber

            # Assemble the verse text from the element text and any <i> subelements
            vText = element.text or ''
            for subelement in element:
                sub2location = "{} in {}".format(subelement.tag, sublocation)
                BibleOrgSysGlobals.checkXMLNoAttributes(
                    subelement, sub2location, 'ks03')
                BibleOrgSysGlobals.checkXMLNoSubelements(
                    subelement, sub2location, 'ks05')
                if subelement.tag == 'i':
                    # ElementTree gives None for missing text or tail, so substitute
                    #   empty strings or the word "None" would end up in the verse
                    vText += '\\it {}\\it*{}'.format(
                        subelement.text or '', subelement.tail or '')
                else:
                    logging.error(
                        "Expected to find 'i' but got {!r}".format(
                            subelement.tag))
            vText += element.tail or ''

            if not vText:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, verseNumber))
                continue

            # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:  # This is how they represent poetry
                firstBit, *otherBits = vText.split('\n')
                thisBook.addLine('q1', '')
                thisBook.addLine('v', verseNumber + ' ' + firstBit)
                for textBit in otherBits:
                    thisBook.addLine('q1', textBit)
            else:  # Just one verse line
                thisBook.addLine('v', verseNumber + ' ' + vText)
    # end of OpenSongXMLBible.__validateAndExtractChapter
class USFXXMLBible( Bible ):
    """
    Class to load and manipulate USFX Bibles.

    A USFX Bible is expected to be a single XML file (root element <usfx>)
        inside the given source folder.
    """
    def __init__( self, sourceFolder, givenName=None, encoding='utf-8' ):
        """
        Create the internal USFX Bible object.

        Scans sourceFolder for candidate files and peeks into each one.
        If exactly one file is detected as USFX,
            self.sourceFilename and self.sourceFilepath are set to it,
            otherwise they are left as None.
        The list of candidate filenames is kept in self.foundFiles.
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = "USFX XML Bible object"
        self.objectTypeString = "USFX"

        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding # Remember our parameters

        # Now we can set our object variables
        self.name = self.givenName
        if not self.name: self.name = os.path.basename( self.sourceFolder )
        if not self.name: self.name = os.path.basename( self.sourceFolder[:-1] ) # Remove the final slash
        if not self.name: self.name = "USFX Bible"
        if self.name.endswith( '_usfx' ): self.name = self.name[:-5] # Remove end of name for Haiola projects

        # Do a preliminary check on the readability of our folder
        if not os.access( self.sourceFolder, os.R_OK ):
            logging.error( "USFXXMLBible: Folder {!r} is unreadable".format( self.sourceFolder ) )

        # Do a preliminary check on the contents of our folder
        self.sourceFilename = self.sourceFilepath = None
        foundFiles, foundFolders = [], []
        self.foundFiles = foundFiles # Remembered because load() needs this list for its fallback
        for something in os.listdir( self.sourceFolder ):
            somepath = os.path.join( self.sourceFolder, something )
            if os.path.isdir( somepath ): foundFolders.append( something )
            elif os.path.isfile( somepath ):
                somethingUpper = something.upper()
                somethingUpperExt = os.path.splitext( somethingUpper )[1]
                if any( somethingUpper.endswith( ending ) for ending in filenameEndingsToIgnore ):
                    continue
                if somethingUpperExt[1:] not in extensionsToIgnore: # Compare without the first dot
                    foundFiles.append( something )
            else: logging.error( "Not sure what {!r} is in {}!".format( somepath, self.sourceFolder ) )
        if foundFolders: logging.info( "USFXXMLBible: Surprised to see subfolders in {!r}: {}".format( self.sourceFolder, foundFolders ) )
        if not foundFiles:
            if BibleOrgSysGlobals.verbosityLevel > 0: print( "USFXXMLBible: Couldn't find any files in {!r}".format( self.sourceFolder ) )
            return # No use continuing

        # Peek into each candidate file to see if it looks like USFX
        numFound = 0
        lastFilenameFound = None
        looksHopeful = False # Was previously read without ever being assigned (NameError)
        for thisFilename in sorted( foundFiles ):
            firstLines = BibleOrgSysGlobals.peekIntoFile( thisFilename, sourceFolder, numLines=3 )
            if not firstLines or len(firstLines)<2: continue
            if not firstLines[0].startswith( '<?xml version="1.0"' ) \
            and not firstLines[0].startswith( '\ufeff<?xml version="1.0"' ): # same but with BOM
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "USFXB (unexpected) first line was {!r} in {}".format( firstLines, thisFilename ) )
                continue
            looksHopeful = True # At least it's an XML file
            if "<usfx " not in firstLines[0]:
                continue
            lastFilenameFound = thisFilename
            numFound += 1
        if numFound:
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "USFXXMLBible got", numFound, sourceFolder, lastFilenameFound )
            if numFound == 1:
                self.sourceFilename = lastFilenameFound
                self.sourceFilepath = os.path.join( self.sourceFolder, self.sourceFilename )
        elif looksHopeful and BibleOrgSysGlobals.verbosityLevel > 2: print( " Looked hopeful but no actual files found" )
    # end of USFXXMLBible.__init_

    def load( self ):
        """
        Load the XML data file -- we should already know the filepath.

        Parses self.sourceFilepath, checks for the <usfx> root,
            stores the languageCode, and passes each <book> element to loadBook().
        Returns early (after logging) if no file was identified or it can't be parsed.
        """
        if BibleOrgSysGlobals.verbosityLevel > 1: print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) )

        if self.sourceFilepath is None: # __init__ didn't find exactly one USFX file
            logging.critical( "USFXXMLBible.load: no USFX file was found in {!r}".format( self.sourceFolder ) )
            return

        try: self.tree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( "USFXXMLBible.load: failed loading the xml file {}: {!r}.".format( self.sourceFilepath, err ) )
            return
        if BibleOrgSysGlobals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        # Find the main (usfx) container
        if self.tree.tag == 'usfx':
            location = "USFX file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            # Process the attributes first
            self.schemaLocation = None
            for attrib,value in self.tree.items():
                if attrib.endswith("SchemaLocation"):
                    self.schemaLocation = value
                else:
                    logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) )

            BBB = C = V = None # Only used in the warning message below
            for element in self.tree:
                sublocation = element.tag + " " + location
                if element.tag == 'languageCode':
                    self.languageCode = element.text
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'cff3' )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'des1' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'dwf2' )
                elif element.tag == 'book':
                    self.loadBook( element )
                else:
                    logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) )

        if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "USFXXMLBible.load: Didn't find any regularly named USFX files in {!r}".format( self.sourceFolder ) )
            # NOTE: This used to iterate over 'foundFiles' which is only a local variable of __init__
            for thisFilename in self.foundFiles:
                # Look for BBB in the ID line (which should be the first line in a USFX file)
                isUSFX = False
                thisPath = os.path.join( self.sourceFolder, thisFilename )
                with open( thisPath, encoding=self.encoding ) as possibleUSXFile: # Automatically closes the file when done
                    firstLine = possibleUSXFile.readline() # We only look at the first line
                if firstLine.startswith( '\\id ' ):
                    USXId = firstLine[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                    if BibleOrgSysGlobals.verbosityLevel > 2: print( "Have possible USFX ID {!r}".format( USXId ) )
                    BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( USXId )
                    if BibleOrgSysGlobals.verbosityLevel > 2: print( "BBB is {!r}".format( BBB ) )
                    isUSFX = True
                if isUSFX:
                    UBB = USFXXMLBibleBook( self, BBB )
                    UBB.load( self.sourceFolder, thisFilename, self.encoding )
                    UBB.validateMarkers()
                    print( UBB )
                    self.books[BBB] = UBB
                    # Make up our book name dictionaries while we're at it
                    assumedBookNames = UBB.getAssumedBookNames()
                    for assumedBookName in assumedBookNames:
                        self.BBBToNameDict[BBB] = assumedBookName
                        assumedBookNameLower = assumedBookName.lower()
                        self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                        self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                        if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
            if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of USFXXMLBible.load

    def loadBook( self, bookElement ):
        """
        Load the book container from the XML data file.

        bookElement must be a <book> element.
        Creates self.thisBook, converts each child element into lines on it
            (tracking the current chapter C and verse V for the messages),
            then passes the finished book to saveBook().
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) )
        assert( bookElement.tag == 'book' )
        mainLocation = self.name + " USFX book"

        # Process the attributes first
        bookCode = None
        for attrib,value in bookElement.items():
            if attrib == 'id':
                bookCode = value
            else:
                logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) )
        BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( bookCode )
        mainLocation = "{} USFX {} book".format( self.name, BBB )
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) )
        BibleOrgSysGlobals.checkXMLNoText( self.tree, mainLocation, '4f6h' )
        BibleOrgSysGlobals.checkXMLNoTail( self.tree, mainLocation, '1wk8' )

        # Now create our actual book
        self.thisBook = BibleBook( self, BBB )
        self.thisBook.objectNameString = "USFX XML Bible Book object"
        self.thisBook.objectTypeString = "USFX"

        C = V = '0'
        for element in bookElement:
            # mainLocation already includes the BBB so just add the chapter:verse
            #   (five arguments used to be given to only four fields so V was silently dropped)
            location = "{} of {} {}:{}".format( element.tag, mainLocation, C, V )
            if element.tag == 'id':
                idText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'vsg3' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ksq2' )
                for attrib,value in element.items():
                    if attrib == 'id':
                        assert( value == bookCode )
                    else:
                        logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'id', bookCode + ((' '+idText) if idText else '') )
            elif element.tag == 'ide':
                ideText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'jsa0' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ls01' )
                charset = None
                for attrib,value in element.items():
                    if attrib == 'charset':
                        charset = value
                    else:
                        logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                # A missing charset attribute used to cause a TypeError here
                self.thisBook.addLine( 'ide', (charset or '') + ((' '+ideText) if ideText else '') )
            elif element.tag == 'h':
                hText = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'dj35' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'hs35' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'hs32' )
                self.thisBook.addLine( 'h', clean(hText) )
            elif element.tag == 'toc':
                tocText = element.text
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ss13' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js13' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems compulsory
                        level = value
                    else:
                        logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'toc'+level, clean(tocText) )
            elif element.tag == 'c':
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' )
                # This is a milestone
                for attrib,value in element.items():
                    if attrib == 'id':
                        C, V = value, '0'
                    else:
                        logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.addLine( 'c', C )
            elif element.tag == 's':
                sText = clean( element.text )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'wxg0' )
                level = None
                for attrib,value in element.items():
                    if attrib == 'level': # Seems optional
                        level = value
                    else:
                        logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                marker = 's'
                if level: marker += level
                self.thisBook.addLine( marker, sText )
                for subelement in element:
                    sublocation = subelement.tag + " of " + location
                    if subelement.tag == 'f':
                        self.loadFootnote( subelement, sublocation, BBB, C, V )
                    elif subelement.tag == 'x':
                        self.loadCrossreference( subelement, sublocation )
                    elif subelement.tag == 'fig':
                        self.loadFigure( subelement, sublocation )
                    elif subelement.tag == 'table':
                        self.loadTable( subelement, sublocation )
                    elif subelement.tag in ('add','it','bd','bdit','sc',):
                        self.loadCharacterFormatting( subelement, sublocation, BBB, C, V )
                    elif subelement.tag == 'optionalLineBreak':
                        print( "What is loadBook optionalLineBreak?" )
                    else:
                        logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) )
            elif element.tag in ('p','q','d',):
                # loadParagraph returns None if the paragraph had no verse milestones
                #   -- in that case keep our current verse number
                paragraphV = self.loadParagraph( element, location, BBB, C )
                if paragraphV is not None: V = paragraphV
            elif element.tag == 'b':
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'ks35' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'gs35' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'nd04' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kdr3' )
                self.thisBook.addLine( 'b', '' )
            elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'od01' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'gd92' )
                idField = None
                for attrib,value in element.items():
                    if attrib == 'id': idField = value
                    else:
                        logging.warning( _("dv35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if idField and text is None:
                    text = idField
                elif idField: # Only complain if there really is an id that we didn't use
                    logging.warning( _("dve4 Unprocessed idField ({}) in {}").format( idField, location ) )
                if text is None:
                    logging.critical( "Why is {} empty at {}".format( marker, location ) )
                assert( text is not None )
                self.thisBook.addLine( marker, text )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 've': # What's this in Psalms: <c id="4" /><ve /><d>For the Chief Musician; on stringed instruments. A Psalm of David.</d>
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'kds3' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ks29' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'kj24' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'js91' )
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring 've' field", BBB, C, V )
            else:
                logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) )
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
        self.saveBook( self.thisBook )
    # end of USFXXMLBible.loadBook

    def loadParagraph( self, paragraphElement, paragraphLocation, BBB, C ):
        """
        Load the paragraph (p or q) container from the XML data file.

        Adds a line for the paragraph itself to self.thisBook
            and then processes the verse milestones, notes, and character formatting inside it.

        Returns the id of the last verse milestone found in the paragraph
            or None if there weren't any.
        """
        V = None
        pTag, pText = paragraphElement.tag, clean(paragraphElement.text)
        BibleOrgSysGlobals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' )

        # Process the attributes first
        sfm = level = style = None
        for attrib,value in paragraphElement.items():
            if attrib == 'sfm':
                sfm = value
            elif attrib == 'level':
                level = value
            elif attrib == 'style':
                style = value
            else:
                logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) )

        if sfm: # This overrides the marker
            assert( pTag == 'p' )
            pTag = sfm
        if level: # Could be for q but also for mt, etc.
            pTag += level
        if style:
            if BibleOrgSysGlobals.verbosityLevel > 2: print( "Ignoring {!r} style".format( style ) )

        self.thisBook.addLine( pTag, '' if pText is None else pText )

        for element in paragraphElement:
            location = element.tag + " of " + paragraphLocation
            if element.tag == 'v': # verse milestone
                vTail = clean( element.tail ) # Main verse text
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'crc2' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'lct3' )
                V = None
                for attrib,value in element.items():
                    if attrib == 'id':
                        V = value
                    else:
                        logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                assert( V is not None )
                assert( V )
                self.thisBook.addLine( 'v', V + ((' '+vTail) if vTail else '' ) )
            elif element.tag == 've': # verse end milestone -- we can just ignore this
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'lsc3' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'mfy4' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'bd24' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'ks35' )
            elif element.tag == 'fig':
                self.loadFigure( element, location )
            elif element.tag == 'table':
                self.loadTable( element, location )
            elif element.tag == 'f':
                self.loadFootnote( element, location, BBB, C, V )
            elif element.tag == 'x':
                self.loadCrossreference( element, location )
            elif element.tag in ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting
                self.loadCharacterFormatting( element, location, BBB, C, V )
            elif element.tag == 'cs': # character style -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'kf92' )
                csSFM = None
                for attrib,value in element.items():
                    if attrib == 'sfm': csSFM = value
                    else:
                        logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if csSFM not in ('w','ior',): print( "cs sfm got", repr(csSFM) )
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( csSFM, text, csSFM, (' '+tail) if tail else '' ) )
            elif element.tag in ('cp',): # Simple single-line paragraph-level markers
                marker, text = element.tag, clean(element.text)
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'kdf0' )
                BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'lkj1' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'da13' )
                self.thisBook.addLine( marker, text )
            elif element.tag == 'ref': # encoded reference -- seems like a USFX hack
                text, tail = clean(element.text), clean(element.tail)
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'bd83' )
                target = None
                for attrib,value in element.items():
                    if attrib == 'tgt': target = value
                    else:
                        logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) )
            elif element.tag == 'optionalLineBreak':
                print( "What is loadParagraph optionalLineBreak?" )
                if BibleOrgSysGlobals.debugFlag: halt
            elif element.tag == 'milestone': # e.g., <milestone sfm="pb" attribute=""/> (pb = explicit page break)
                BibleOrgSysGlobals.checkXMLNoText( element, location, 'jzx2' )
                BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ms23' )
                BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'dw24' )
                milestoneSFM = None
                for attrib,value in element.items():
                    if attrib == 'sfm': milestoneSFM = value
                    else:
                        logging.warning( _("mcd2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
                if milestoneSFM not in ('pb',): print( "milestone sfm got", repr(milestoneSFM) )
                self.thisBook.addLine( milestoneSFM, '' )
            else:
                logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) )
        return V
    # end of USFXXMLBible.loadParagraph

    def loadCharacterFormatting( self, element, location, BBB, C, V ):
        """
        Append a character formatting field to the last line of self.thisBook.

        The element tag is used as the marker: the opening marker and the element text
            are appended first, then any footnote subelements are loaded,
            and then the closing marker (plus any tail text) is appended.
        """
        marker, text, tail = element.tag, clean(element.text), clean(element.tail)
        BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'sd12' )
        self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) )
        for subelement in element:
            sublocation = subelement.tag + " of " + location
            if subelement.tag == 'f':
                self.loadFootnote( subelement, sublocation, BBB, C, V )
            else:
                logging.warning( _("sf31 Unprocessed {} element after {} {}:{} in {}").format( repr(subelement.tag), self.thisBook.BBB, C, V, location ) )
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
        self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, (' '+tail) if tail else '' ) )
    # end of
# USFXXMLBible.loadCharacterFormatting


def loadFigure( self, element, location ):
    """
    Append a figure element to the last line as a USFM \\fig ... \\fig* field.

    Each subelement of the figure must be one of the seven known fields.
    They are written out in the fixed order
        description|catalog|size|location|copyright|caption|reference
    with an empty string for any field that is absent or has no text.

    Parameters:
        element: the XML fig element.
        location: description of the element's position (used in messages).
    """
    BibleOrgSysGlobals.checkXMLNoText( element, location, 'ff36' )
    BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'cf35' )
    fieldOrder = ('description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference',)
    figDict = { fieldName:'' for fieldName in fieldOrder }
    for subelement in element:
        sublocation = subelement.tag + " of " + location
        figTag, figText = subelement.tag, clean(subelement.text)
        assert( figTag in figDict )
        figDict[figTag] = '' if figText is None else figText
        BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'jkf5' )
        BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'ld18' )
        BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'hb46' )
    newString = '|'.join( figDict[fieldName] for fieldName in fieldOrder )
    figTail = clean( element.tail )
    self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( newString, (' '+figTail) if figTail else '' ) )
# end of USFXXMLBible.loadFigure


def loadTable( self, element, location, C=None, V=None ):
    """
    Load a table element: each tr subelement starts a new 'tr' line and each
        of its cells (th, thr, tc, tcr) is appended to that line.

    Parameters:
        element: the XML table element.
        location: description of the element's position (used in messages).
        C, V: optional current chapter and verse, only used in the warning
            about unexpected subelements. (The warning previously referred to
            C and V without them being defined in this method.)
    """
    BibleOrgSysGlobals.checkXMLNoText( element, location, 'kg92' )
    BibleOrgSysGlobals.checkXMLNoTail( element, location, 'ka92' )
    BibleOrgSysGlobals.checkXMLNoAttributes( element, location, 'ks63' )
    for subelement in element:
        sublocation = subelement.tag + " of " + location
        if subelement.tag == 'tr':
            self.thisBook.addLine( 'tr', '' )
            BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'sg32' )
            BibleOrgSysGlobals.checkXMLNoTail( subelement, sublocation, 'dh82' )
            BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'mniq' )
            for sub2element in subelement:
                sub2location = sub2element.tag + " of " + sublocation
                tag, text = sub2element.tag, clean(sub2element.text)
                assert( tag in ('th', 'thr', 'tc', 'tcr',) )
                BibleOrgSysGlobals.checkXMLNoTail( sub2element, sub2location, 'ah82' )
                BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' )
                level = None
                for attrib,value in sub2element.items():
                    if attrib == 'level': level = value
                    else: # Report the cell itself, not the whole table
                        logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                marker = tag + (level if level else '')
                # An empty cell has text None -- write an empty cell, not the word 'None'
                self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text or '' ) )
        else:
            logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, self.thisBook.BBB, C, V, sublocation ) )
# end of USFXXMLBible.loadTable


def loadFootnote( self, element, location, BBB, C, V ):
    """
    Handles footnote fields, including xt field.

    Appends ' \\f caller text', then each subfield, then '\\f*' and the
        footnote's tail text to the last line of self.thisBook.

    Parameters:
        element: the XML f element.
        location: description of the element's position (used in messages).
        BBB, C, V: current book code, chapter and verse, used in messages
            and passed on to loadCharacterFormatting.
    """
    characterMarkers = ('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',)

    text, tail = clean(element.text), clean(element.tail)
    caller = None
    for attrib,value in element.items():
        if attrib == 'caller': caller = value
        else:
            logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
    self.thisBook.appendToLastLine( ' \\f {}{}'.format( caller, (' '+text) if text else '' ) )

    for subelement in element:
        sublocation = subelement.tag + " of " + location
        marker, fText, fTail = subelement.tag, clean(subelement.text), clean(subelement.tail)
        if BibleOrgSysGlobals.debugFlag: assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq','xt',) )
        if marker=='ref':
            assert( fText )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'ls13' )
            target = None
            for attrib,value in subelement.items():
                if attrib == 'tgt': target = value
                else:
                    logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
            if target:
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, fText ) )
            else: halt # NOTE(review): undefined name -- presumably a deliberate stop (NameError) for a ref without a target
        else:
            BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'dq54' )
            # The text is None when the field starts with a subelement (e.g., <xt><ref .../></xt>)
            #   -- write nothing rather than the word 'None'
            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, fText or '' ) )
            if marker=='xt' or marker[0]=='f': # Starts with f, e.g., fr, ft
                for sub2element in subelement:
                    sub2location = sub2element.tag + " of " + sublocation
                    marker2, fText2, fTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail)
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'js72' )
                    if marker2 == 'ref':
                        if fText2:
                            self.thisBook.appendToLastLine( fText2 )
                        target = None
                        for attrib,value in sub2element.items():
                            if attrib == 'tgt': target = value # OSIS style reference, e.g., '1SA.27.8'
                            else:
                                logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                        if target:
                            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) )
                        else:
                            if debuggingThisModule: halt
                    elif marker2 in characterMarkers: # character formatting
                        self.loadCharacterFormatting( sub2element, sub2location, BBB, C, V )
                    else:
                        print( 'Ignored marker2', repr(marker2), BBB, C, V )
                        if debuggingThisModule: halt
                    if fTail2: self.thisBook.appendToLastLine( fTail2 )
            elif marker in characterMarkers: # character formatting
                # NOTE(review): the marker and text were already appended just above,
                #   so this looks like it writes them a second time -- confirm intended output
                self.loadCharacterFormatting( subelement, sublocation, BBB, C, V )
            else:
                print( 'Ignored marker', repr(marker), BBB, C, V )
                halt
        if fTail:
            self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, fTail ) )
    self.thisBook.appendToLastLine( '\\f*{}'.format( (' '+tail) if tail else '' ) )
# end of USFXXMLBible.loadFootnote


def loadCrossreference( self, element, location ):
    """
    Append a cross-reference element to the last line as \\x ... \\x*.

    Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x>

    Parameters:
        element: the XML x element.
        location: description of the element's position (used in messages).
    """
    text, tail = clean(element.text), clean(element.tail)
    caller = None
    for attrib,value in element.items():
        if attrib == 'caller': caller = value
        else:
            logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) )
    self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) )

    for subelement in element:
        sublocation = subelement.tag + " of " + location
        marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail)
        if BibleOrgSysGlobals.debugFlag: assert( marker in ('ref','xo','xt',) )
        if marker=='ref':
            assert( xText )
            BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 's1sd' )
            target = None
            for attrib,value in subelement.items():
                if attrib == 'tgt': target = value
                else:
                    logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) )
            if target:
                self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) )
            else: halt # NOTE(review): undefined name -- presumably a deliberate stop (NameError) for a ref without a target
        else:
            BibleOrgSysGlobals.checkXMLNoAttributes( subelement, sublocation, 'sc35' )
            # The text is None when the field starts with a subelement
            #   -- write nothing rather than the word 'None'
            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText or '' ) )
            if marker[0] == 'x': # Starts with x, e.g., xo, xt
                for sub2element in subelement:
                    sub2location = sub2element.tag + " of " + sublocation
                    marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail)
                    BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' )
                    if marker2=='ref':
                        if xText2:
                            self.thisBook.appendToLastLine( xText2 )
                        target = None
                        for attrib,value in sub2element.items():
                            if attrib == 'tgt': target = value
                            else:
                                logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) )
                        if target:
                            self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) )
                        else: halt
                    else: halt
                    if xTail2: self.thisBook.appendToLastLine( xTail2 )
            else: halt
        if xTail:
            self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, xTail ) )
    self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
class OpenSongXMLBible( Bible ):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The expected structure is a 'bible' root element containing 'b' (book)
    elements, each containing 'c' (chapter) elements, each containing
    'v' (verse) elements.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'


    def __init__( self, sourceFolder, givenName, encoding='utf-8' ):
        """
        Constructor: just sets up the XML Bible file converter object.

        Parameters:
            sourceFolder: folder containing the XML file.
            givenName: filename of the XML file inside sourceFolder
                (also used as the name of this Bible).
            encoding: stored on the object (not used by the loading code in this class).
        """
        # Setup and initialise the base class first
        if BibleOrgSysGlobals.debugFlag: print( "OpenSongXMLBible( {}, {}, {} )".format( sourceFolder, givenName, encoding ) )
        Bible.__init__( self )
        self.objectNameString = "OpenSong XML Bible object"
        self.objectTypeString = "OpenSong"

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join( self.sourceFolder, self.givenName )

        self.tree = None # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem( "GENERIC-KJV-66-ENG" )

        # Do a preliminary check on the readability of our file
        #   (only reported here -- load() will still try to parse it)
        if not os.access( self.sourceFilepath, os.R_OK ):
            print( "OpenSongXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) )

        self.name = self.givenName
    # end of OpenSongXMLBible.__init__


    def load( self ):
        """
        Load a single source XML file and load book elements.

        Every book element under the root is extracted and saved; 'OT' and 'NT'
        elements are silently skipped and anything else is logged as an error.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )
        self.tree = ElementTree().parse( self.sourceFilepath )
        if BibleOrgSysGlobals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == OpenSongXMLBible.treeTag:
            location = "XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            # The n and sn attributes are recognised (so no warning is given) but not used further here
            name = shortName = None
            for attrib,value in self.tree.items():
                if attrib=="n": name = value
                elif attrib=="sn": shortName = value
                else:
                    logging.warning( "Unprocessed {!r} attribute ({}) in main element".format( attrib, value ) )

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == OpenSongXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                    self.__validateAndExtractBook( element )
                elif element.tag in ('OT','NT',):
                    pass
                else:
                    logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.bookTag, element.tag ) )
        else:
            logging.error( "Expected to load {!r} but got {!r}".format( OpenSongXMLBible.treeTag, self.tree.tag ) )
        self.doPostLoadProcessing()
    # end of OpenSongXMLBible.load


    def __validateAndExtractBook( self, book ):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book's 'n' attribute is looked up (as an English book name) to get
        the book code. Books whose name is missing or not recognised are
        logged as errors and not loaded.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating OpenSong XML book...") )

        # Process the div attributes first
        bookName = None
        for attrib,value in book.items():
            if attrib=="n": bookName = value
            else:
                logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )

        if not bookName:
            logging.error( _("OpenSong load can't find a book name") ) # no bookName
            return
        BBB = self.genericBOS.getBBB( bookName ) # Booknames are in English
        if not BBB:
            logging.error( _("OpenSong load doesn't recognize book name: {!r}").format( bookName ) ) # no BBB
            return

        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}...").format( BBB, bookName ) )
        thisBook = BibleBook( self, BBB )
        thisBook.objectNameString = "OpenSong XML Bible Book object"
        thisBook.objectTypeString = "OpenSong"
        USFMAbbreviation = BibleOrgSysGlobals.BibleBooksCodes.getUSFMAbbreviation( BBB )
        thisBook.addLine( 'id', '{} imported by {}'.format( USFMAbbreviation.upper(), ProgNameVersion ) )
        thisBook.addLine( 'h', bookName )
        thisBook.addLine( 'mt1', bookName )
        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format( BBB )
                BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'j3jd' )
                BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                self.__validateAndExtractChapter( BBB, thisBook, element )
            else:
                logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.chapterTag, element.tag ) )
        if BibleOrgSysGlobals.verbosityLevel > 2: print( " Saving {} into results...".format( BBB ) )
        self.saveBook( thisBook )
    # end of OpenSongXMLBible.__validateAndExtractBook


    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        Parameters:
            BBB: book code (used in messages).
            thisBook: the book object that the c, v and q1 lines are added to.
            chapter: the XML chapter element.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter...") )

        # Process the div attributes first
        #   (VERSES is recognised so that no warning is given, but is not used further)
        chapterNumber = numVerses = None
        for attrib,value in chapter.items():
            if attrib=="n": chapterNumber = value
            elif attrib=="VERSES": numVerses = value
            else:
                logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber:
            chapterNumber = chapterNumber.replace( 'of Solomon ', '' ) # Fix a mistake in the Chinese_SU module
            thisBook.addLine( 'c', chapterNumber )
        else: # The message previously had no placeholder, so the book code was never shown
            logging.error( "Missing 'n' attribute in chapter element for {}".format( BBB ) )

        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error( "Expected to find {!r} but got {!r}".format( OpenSongXMLBible.verseTag, element.tag ) )
                continue

            sublocation = "verse in {} {}".format( BBB, chapterNumber )
            BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'l5ks' )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5f7h' )
            # The t attribute is recognised (so no warning is given) but not used further
            verseNumber = toVerseNumber = None
            for attrib,value in element.items():
                if attrib=="n": verseNumber = value
                elif attrib=="t": toVerseNumber = value
                else:
                    logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
            if BibleOrgSysGlobals.debugFlag: assert( verseNumber )

            vText = element.text
            if not vText:
                logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, verseNumber ) )
                continue

            # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText: # This is how they represent poetry
                for j, textBit in enumerate( vText.split( '\n' ) ):
                    if j==0: # The verse number goes with the first poetry line only
                        thisBook.addLine( 'q1', '' )
                        thisBook.addLine( 'v', verseNumber + ' ' + textBit )
                    else:
                        thisBook.addLine( 'q1', textBit )
            else: # Just one verse line
                thisBook.addLine( 'v', verseNumber + ' ' + vText )
    # end of OpenSongXMLBible.__validateAndExtractChapter
# end of OpenSongXMLBible class
class USFXXMLBible( Bible ): """ Class to load and manipulate USFX Bibles. """ def __init__( self, sourceFolder, givenName=None, encoding='utf-8' ): """ Create the internal USFX Bible object. """ # Setup and initialise the base class first Bible.__init__( self ) self.objectNameString = "USFX XML Bible object" self.objectTypeString = "USFX" self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding # Remember our parameters # Now we can set our object variables self.name = self.givenName if not self.name: self.name = os.path.basename( self.sourceFolder ) if not self.name: self.name = os.path.basename( self.sourceFolder[:-1] ) # Remove the final slash if not self.name: self.name = "USFX Bible" if self.name.endswith( '_usfx' ): self.name = self.name[:-5] # Remove end of name for Haiola projects # Do a preliminary check on the readability of our folder if not os.access( self.sourceFolder, os.R_OK ): logging.error( "USFXXMLBible: Folder '{}' is unreadable".format( self.sourceFolder ) ) # Do a preliminary check on the contents of our folder self.sourceFilename = self.sourceFilepath = None foundFiles, foundFolders = [], [] for something in os.listdir( self.sourceFolder ): somepath = os.path.join( self.sourceFolder, something ) if os.path.isdir( somepath ): foundFolders.append( something ) elif os.path.isfile( somepath ): somethingUpper = something.upper() somethingUpperProper, somethingUpperExt = os.path.splitext( somethingUpper ) ignore = False for ending in filenameEndingsToIgnore: if somethingUpper.endswith( ending): ignore=True; break if ignore: continue if not somethingUpperExt[1:] in extensionsToIgnore: # Compare without the first dot foundFiles.append( something ) else: logging.error( "Not sure what '{}' is in {}!".format( somepath, self.sourceFolder ) ) if foundFolders: logging.info( "USFXXMLBible: Surprised to see subfolders in '{}': {}".format( self.sourceFolder, foundFolders ) ) if not foundFiles: if Globals.verbosityLevel > 0: print( 
"USFXXMLBible: Couldn't find any files in '{}'".format( self.sourceFolder ) ) return # No use continuing #print( self.sourceFolder, foundFolders, len(foundFiles), foundFiles ) numFound = 0 for thisFilename in sorted( foundFiles ): firstLines = Globals.peekIntoFile( thisFilename, sourceFolder, numLines=3 ) if not firstLines or len(firstLines)<2: continue if not firstLines[0].startswith( '<?xml version="1.0"' ) \ and not firstLines[0].startswith( '\ufeff<?xml version="1.0"' ): # same but with BOM if Globals.verbosityLevel > 2: print( "USFXB (unexpected) first line was '{}' in {}".format( firstLines, thisFilename ) ) continue if "<usfx " not in firstLines[0]: continue lastFilenameFound = thisFilename numFound += 1 if numFound: if Globals.verbosityLevel > 2: print( "USFXXMLBible got", numFound, sourceFolder, lastFilenameFound ) if numFound == 1: self.sourceFilename = lastFilenameFound self.sourceFilepath = os.path.join( self.sourceFolder, self.sourceFilename ) elif looksHopeful and Globals.verbosityLevel > 2: print( " Looked hopeful but no actual files found" ) # end of USFXXMLBible.__init_ def load( self ): """ Load the XML data file -- we should already know the filepath. 
""" if Globals.verbosityLevel > 1: print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) ) #if Globals.verbosityLevel > 2: print( _(" It seems we have {}...").format( BBB ) ) #self.thisBook = BibleBook( self, BBB ) #self.thisBook.objectNameString = "OSIS XML Bible Book object" #self.thisBook.objectTypeString = "OSIS" #self.haveBook = True try: self.tree = ElementTree().parse( self.sourceFilepath ) except ParseError: errorString = sys.exc_info()[1] logging.critical( "USFXXMLBible.load: failed loading the xml file {}: '{}'.".format( self.sourceFilepath, errorString ) ) return if Globals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all # Find the main (osis) container if self.tree.tag == 'usfx': location = "USFX file" Globals.checkXMLNoText( self.tree, location, '4f6h' ) Globals.checkXMLNoTail( self.tree, location, '1wk8' ) # Process the attributes first self.schemaLocation = None for attrib,value in self.tree.items(): #print( "attrib", repr(attrib), repr(value) ) if attrib.endswith("SchemaLocation"): self.schemaLocation = value else: logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) ) BBB = C = V = None for element in self.tree: #print( "element", repr(element.tag) ) sublocation = element.tag + " " + location if element.tag == 'languageCode': self.languageCode = element.text Globals.checkXMLNoTail( element, sublocation, 'cff3' ) Globals.checkXMLNoAttributes( element, sublocation, 'des1' ) Globals.checkXMLNoSubelements( element, sublocation, 'dwf2' ) elif element.tag == 'book': self.loadBook( element ) ##Globals.checkXMLNoSubelements( element, sublocation, '54f2' ) #Globals.checkXMLNoTail( element, sublocation, 'hd35' ) ## Process the attributes #idField = bookStyle = None #for attrib,value in element.items(): #if attrib=='id' or attrib=='code': #idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode) ##if idField != BBB: 
## logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) ) #elif attrib=='style': #bookStyle = value #else: #logging.warning( _("gfw2 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) else: logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) ) #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here if Globals.verbosityLevel > 2: print( "USFXXMLBible.load: Didn't find any regularly named USFX files in '{}'".format( self.sourceFolder ) ) for thisFilename in foundFiles: # Look for BBB in the ID line (which should be the first line in a USFX file) isUSFX = False thisPath = os.path.join( self.sourceFolder, thisFilename ) with open( thisPath ) as possibleUSXFile: # Automatically closes the file when done for line in possibleUSXFile: if line.startswith( '\\id ' ): USXId = line[4:].strip()[:3] # Take the first three non-blank characters after the space after id if Globals.verbosityLevel > 2: print( "Have possible USFX ID '{}'".format( USXId ) ) BBB = Globals.BibleBooksCodes.getBBBFromUSFM( USXId ) if Globals.verbosityLevel > 2: print( "BBB is '{}'".format( BBB ) ) isUSFX = True break # We only look at the first line if isUSFX: UBB = USFXXMLBibleBook( self, BBB ) UBB.load( self.sourceFolder, thisFilename, self.encoding ) UBB.validateMarkers() print( UBB ) self.books[BBB] = UBB # Make up our book name dictionaries while we're at it assumedBookNames = UBB.getAssumedBookNames() for assumedBookName in assumedBookNames: self.BBBToNameDict[BBB] = assumedBookName assumedBookNameLower = assumedBookName.lower() self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case) self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just 
lower case) if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces) if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) ) self.doPostLoadProcessing() # end of USFXXMLBible.load def loadBook( self, bookElement ): """ Load the book container from the XML data file. """ if Globals.verbosityLevel > 3: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) ) assert( bookElement.tag == 'book' ) mainLocation = self.name + " USFX book" # Process the attributes first bookCode = None for attrib,value in bookElement.items(): if attrib == 'id': bookCode = value else: logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) ) BBB = Globals.BibleBooksCodes.getBBBFromUSFM( bookCode ) mainLocation = "{} USFX {} book".format( self.name, BBB ) if Globals.verbosityLevel > 2: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) ) Globals.checkXMLNoText( self.tree, mainLocation, '4f6h' ) Globals.checkXMLNoTail( self.tree, mainLocation, '1wk8' ) # Now create our actual book self.thisBook = BibleBook( self, BBB ) self.thisBook.objectNameString = "USFX XML Bible Book object" self.thisBook.objectTypeString = "USFX" C = V = '0' for element in bookElement: #print( "element", repr(element.tag) ) location = "{} of {} {}:{}".format( element.tag, mainLocation, C, V ) if element.tag == 'id': idText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'vsg3' ) Globals.checkXMLNoSubelements( element, location, 'ksq2' ) for attrib,value in element.items(): if attrib == 'id': assert( value == bookCode ) else: logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'id', bookCode + ((' '+idText) if idText else '') ) elif element.tag == 'ide': ideText = 
clean( element.text ) Globals.checkXMLNoTail( element, location, 'jsa0' ) Globals.checkXMLNoSubelements( element, location, 'ls01' ) charset = None for attrib,value in element.items(): if attrib == 'charset': charset = value else: logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'ide', charset + ((' '+ideText) if ideText else '') ) elif element.tag == 'h': hText = element.text Globals.checkXMLNoTail( element, location, 'dj35' ) Globals.checkXMLNoAttributes( element, location, 'hs35' ) Globals.checkXMLNoSubelements( element, location, 'hs32' ) self.thisBook.appendLine( 'h', clean(hText) ) elif element.tag == 'toc': tocText = element.text Globals.checkXMLNoTail( element, location, 'ss13' ) Globals.checkXMLNoSubelements( element, location, 'js13' ) level = None for attrib,value in element.items(): if attrib == 'level': # Seems compulsory level = value else: logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'toc'+level, clean(tocText) ) elif element.tag == 'c': Globals.checkXMLNoText( element, location, 'ks35' ) Globals.checkXMLNoTail( element, location, 'gs35' ) Globals.checkXMLNoSubelements( element, location, 'kdr3' ) # This is a milestone for attrib,value in element.items(): if attrib == 'id': C, V = value, '0' else: logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'c', C ) elif element.tag == 's': sText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'wxg0' ) level = None for attrib,value in element.items(): if attrib == 'level': # Seems optional level = value else: logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) marker = 's' if level: marker += level self.thisBook.appendLine( marker, sText ) for subelement in element: #print( "subelement", repr(subelement.tag) ) 
sublocation = subelement.tag + " of " + location if subelement.tag == 'f': self.loadFootnote( subelement, sublocation ) elif subelement.tag == 'x': self.loadCrossreference( subelement, sublocation ) elif subelement.tag == 'fig': self.loadFigure( subelement, sublocation ) elif subelement.tag == 'table': self.loadTable( subelement, sublocation ) elif subelement.tag in ('add','it','bd','bdit','sc',): self.loadCharacterFormatting( subelement, sublocation ) elif subelement.tag == 'optionalLineBreak': print( "What is loadBook optionalLineBreak?" ) else: logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) ) elif element.tag in ('p','q','d',): V = self.loadParagraph( element, location, C ) elif element.tag == 'b': Globals.checkXMLNoText( element, location, 'ks35' ) Globals.checkXMLNoTail( element, location, 'gs35' ) Globals.checkXMLNoAttributes( element, location, 'nd04' ) Globals.checkXMLNoSubelements( element, location, 'kdr3' ) self.thisBook.appendLine( 'b', '' ) elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers marker, text = element.tag, clean(element.text) Globals.checkXMLNoTail( element, location, 'od01' ) Globals.checkXMLNoAttributes( element, location, 'us91' ) Globals.checkXMLNoSubelements( element, location, 'gd92' ) self.thisBook.appendLine( marker, text ) elif element.tag == 'table': self.loadTable( element, location ) else: logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) ) #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) if Globals.debugFlag: halt self.saveBook( self.thisBook ) # end of USFXXMLBible.loadBook def loadParagraph( self, paragraphElement, paragraphLocation, C ): """ Load the paragraph (p or q) container from the XML data file. 
""" #if Globals.verbosityLevel > 3: #print( _("USFXXMLBible.loadParagraph: Loading {} from {}...").format( self.name, self.sourceFolder ) ) V = None pText = paragraphElement.text Globals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' ) # Process the attributes first sfm = level = style = None for attrib,value in paragraphElement.items(): if attrib == 'sfm': sfm = value elif attrib == 'level': level = value elif attrib == 'style': style = value else: logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) ) for element in paragraphElement: location = element.tag + " of " + paragraphLocation #print( "element", repr(element.tag) ) if element.tag == 'v': # verse milestone vTail = clean( element.tail ) # Main verse text Globals.checkXMLNoText( element, location, 'crc2' ) Globals.checkXMLNoSubelements( element, location, 'lct3' ) lastV, V = V, None for attrib,value in element.items(): if attrib == 'id': V = value else: logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) assert( V is not None ) assert( V ) self.thisBook.appendLine( 'v', V + ((' '+vTail) if vTail else '' ) ) elif element.tag == 've': # verse end milestone -- we can just ignore this Globals.checkXMLNoText( element, location, 'lsc3' ) Globals.checkXMLNoTail( element, location, 'mfy4' ) Globals.checkXMLNoAttributes( element, location, 'bd24' ) Globals.checkXMLNoSubelements( element, location, 'ks35' ) elif element.tag == 'fig': self.loadFigure( element, location ) elif element.tag == 'table': self.loadTable( element, location ) elif element.tag == 'f': #print( "USFX.loadParagraph Found footnote at", paragraphLocation, C, V, repr(element.text) ) self.loadFootnote( element, location ) elif element.tag == 'x': #print( "USFX.loadParagraph Found xref at", paragraphLocation, C, V, repr(element.text) ) self.loadCrossreference( element, location ) elif element.tag in 
('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting self.loadCharacterFormatting( element, location ) elif element.tag == 'cs': # character style -- seems like a USFX hack text, tail = clean(element.text), clean(element.tail) Globals.checkXMLNoSubelements( element, location, 'kf92' ) sfm = None for attrib,value in element.items(): if attrib == 'sfm': sfm = value else: logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if sfm not in ('w','ior',): print( "cs sfm got", repr(sfm) ) self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( sfm, text, sfm, (' '+tail) if tail else '' ) ) elif element.tag in ('cp',): # Simple single-line paragraph-level markers marker, text = element.tag, clean(element.text) Globals.checkXMLNoTail( element, location, 'kdf0' ) Globals.checkXMLNoAttributes( element, location, 'lkj1' ) Globals.checkXMLNoSubelements( element, location, 'da13' ) self.thisBook.appendLine( marker, text ) elif element.tag == 'ref': # encoded reference -- seems like a USFX hack text, tail = clean(element.text), clean(element.tail) Globals.checkXMLNoSubelements( element, location, 'bd83' ) target = None for attrib,value in element.items(): if attrib == 'tgt': target = value else: logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) #if target not in ('w','ior',): print( "ref sfm got", repr(sfm) ) self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) ) #print( "Saved", '\\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) ) elif element.tag == 'optionalLineBreak': print( "What is loadParagraph optionalLineBreak?" ) if Globals.debugFlag: halt elif element.tag == 'milestone': print( "What is loadParagraph milestone?" 
) if Globals.debugFlag: halt else: logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) ) return V # end of USFXXMLBible.loadParagraph def loadCharacterFormatting( self, element, location ): """ """ marker, text, tail = element.tag, clean(element.text), clean(element.tail) Globals.checkXMLNoAttributes( element, location, 'sd12' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) ) for subelement in element: sublocation = subelement.tag + " of " + location #print( "element", repr(element.tag) ) if subelement.tag == 'f': #print( "USFX.loadParagraph Found footnote at", sublocation, C, V, repr(subelement.text) ) self.loadFootnote( subelement, sublocation ) else: logging.warning( _("sf31 Unprocessed {} element after {} {}:{} in {}").format( repr(subelement.tag), self.thisBook.BBB, C, V, location ) ) halt self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, (' '+tail) if tail else '' ) ) # end of USFXXMLBible.loadCharacterFormatting def loadFigure( self, element, location ): """ """ Globals.checkXMLNoText( element, location, 'ff36' ) Globals.checkXMLNoAttributes( element, location, 'cf35' ) figDict = { 'description':'', 'catalog':'', 'size':'', 'location':'', 'copyright':'', 'caption':'', 'reference':'' } for subelement in element: sublocation = subelement.tag + " of " + location figTag, figText = subelement.tag, clean(subelement.text) assert( figTag in figDict ) figDict[figTag] = '' if figText is None else figText Globals.checkXMLNoTail( subelement, sublocation, 'jkf5' ) Globals.checkXMLNoAttributes( subelement, sublocation, 'ld18' ) Globals.checkXMLNoSubelements( subelement, sublocation, 'hb46' ) newString = '' for j,tag in enumerate( ('description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference',) ): newString += ('' if j==0 else '|') + figDict[tag] figTail = clean( element.tail ) self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( 
newString, (' '+figTail) if figTail else '' ) ) # end of USFXXMLBible.loadFigure def loadTable( self, element, location ): """ """ Globals.checkXMLNoText( element, location, 'kg92' ) Globals.checkXMLNoTail( element, location, 'ka92' ) Globals.checkXMLNoAttributes( element, location, 'ks63' ) for subelement in element: sublocation = subelement.tag + " of " + location if subelement.tag == 'tr': #print( "table", sublocation ) self.thisBook.appendLine( 'tr', '' ) Globals.checkXMLNoText( subelement, sublocation, 'sg32' ) Globals.checkXMLNoTail( subelement, sublocation, 'dh82' ) Globals.checkXMLNoAttributes( subelement, sublocation, 'mniq' ) for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation tag, text = sub2element.tag, clean(sub2element.text) assert( tag in ('th', 'thr', 'tc', 'tcr',) ) Globals.checkXMLNoTail( sub2element, sub2location, 'ah82' ) Globals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' ) level = None for attrib,value in sub2element.items(): if attrib == 'level': level = value else: logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) marker = tag + (level if level else '') self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) ) else: logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, self.thisBook.BBB, C, V, sublocation ) ) # end of USFXXMLBible.loadTable def loadFootnote( self, element, location ): """ """ text, tail = clean(element.text), clean(element.tail) caller = None for attrib,value in element.items(): if attrib == 'caller': caller = value else: logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendToLastLine( ' \\f {}{}'.format( caller, (' '+text) if text else '' ) ) for subelement in element: sublocation = subelement.tag + " of " + location marker, fText, fTail = subelement.tag, clean(subelement.text), clean(subelement.tail) #print( 
"USFX.loadFootnote", repr(caller), repr(text), repr(tail), repr(marker), repr(fText), repr(fTail) ) #if Globals.verbosityLevel > 0 and marker not in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',): #print( "USFX.loadFootnote found", repr(caller), repr(marker), repr(fText), repr(fTail) ) if Globals.debugFlag: assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',) ) if marker=='ref': assert( fText ) Globals.checkXMLNoSubelements( subelement, sublocation, 'ls13' ) target = None for attrib,value in subelement.items(): if attrib == 'tgt': target = value else: logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, fText ) ) else: halt else: Globals.checkXMLNoAttributes( subelement, sublocation, 'dq54' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, fText ) ) if marker[0] == 'f': # Starts with f, e.g., fr, ft for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation marker2, fText2, fTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail) Globals.checkXMLNoSubelements( sub2element, sub2location, 'js72' ) if marker2=='ref': print( sub2location ) assert( not fText2 ) target = None for attrib,value in sub2element.items(): if attrib == 'tgt': target = value else: logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) ) else: halt else: halt else: halt if fTail: self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, fTail ) ) self.thisBook.appendToLastLine( '\\f*{}'.format( (' '+tail) if tail else '' ) ) # end of USFXXMLBible.loadFootnote def loadCrossreference( self, element, location ): """ Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x> """ text, tail = clean(element.text), clean(element.tail) caller = 
None for attrib,value in element.items(): if attrib == 'caller': caller = value else: logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) ) for subelement in element: sublocation = subelement.tag + " of " + location marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail) #print( "USFX.loadCrossreference", repr(caller), repr(text), repr(tail), repr(marker), repr(xText), repr(xTail) ) #if Globals.verbosityLevel > 0 and marker not in ('ref','xo','xt',): #print( "USFX.loadCrossreference found", repr(caller), repr(marker), repr(xText), repr(xTail) ) if Globals.debugFlag: assert( marker in ('ref','xo','xt',) ) if marker=='ref': assert( xText ) Globals.checkXMLNoSubelements( subelement, sublocation, 's1sd' ) target = None for attrib,value in subelement.items(): if attrib == 'tgt': target = value else: logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) ) else: halt else: Globals.checkXMLNoAttributes( subelement, sublocation, 'sc35' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText ) ) if marker[0] == 'x': # Starts with x, e.g., xo, xt for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail) Globals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' ) if marker2=='ref': assert( not xText2 ) target = None for attrib,value in sub2element.items(): if attrib == 'tgt': target = value else: logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) ) else: halt else: halt else: halt if xTail: self.thisBook.appendToLastLine( 
'\\{}*{}'.format( marker, xTail ) ) self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
class OpenSongXMLBible(Bible):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The expected layout (as checked by the code below) is a root 'bible'
    element containing 'b' (book) elements, each containing 'c' (chapter)
    elements, each containing 'v' (verse) elements. Books, chapters and
    verses are identified by their 'n' attribute.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the XML Bible file converter object.

        sourceFolder and givenName are joined to give the path of the XML
        file. Nothing is parsed here -- call load() for that. An unreadable
        file is only reported with a printed message.
        """
        # Setup and initialise the base class first
        if Globals.debugFlag:
            print("OpenSongXMLBible( {}, {}, {} )".format(
                sourceFolder, givenName, encoding))
        Bible.__init__(self)
        self.objectNameString = "OpenSong XML Bible object"
        self.objectTypeString = "OpenSong"

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.tree = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")

        # Do a preliminary check on the readability of our file
        if not os.access(self.sourceFilepath, os.R_OK):
            print("OpenSongXMLBible: File '{}' is unreadable".format(
                self.sourceFilepath))

        self.name = self.givenName
    # end of OpenSongXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        Every 'b' element directly under the root is passed on for
        validation and extraction. Unexpected elements or an unexpected
        root tag are logged as errors.
        """
        if Globals.verbosityLevel > 2:
            print(_("Loading {}...").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if Globals.debugFlag:
            assert (len(self.tree))  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag != OpenSongXMLBible.treeTag:
            logging.error("Expected to load '{}' but got '{}'".format(
                OpenSongXMLBible.treeTag, self.tree.tag))
            return

        location = "XML file"
        Globals.checkXMLNoText(self.tree, location, '4f6h')
        Globals.checkXMLNoTail(self.tree, location, '1wk8')

        # 'n' (name) and 'sn' (short name) are recognised, so that no warning
        #   is logged for them, but their values are not used yet
        for attrib, value in self.tree.items():
            if attrib not in ("n", "sn"):
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in main element".format(
                        attrib, value))

        # Find the submain (book) containers
        for element in self.tree:
            if element.tag == OpenSongXMLBible.bookTag:
                sublocation = "book in " + location
                Globals.checkXMLNoText(element, sublocation, 'g3g5')
                Globals.checkXMLNoTail(element, sublocation, 'd3f6')
                self.__validateAndExtractBook(element)
            elif element.tag in ('OT', 'NT'):
                pass  # These two elements are accepted but nothing is loaded from them
            else:
                logging.error("Expected to find '{}' but got '{}'".format(
                    OpenSongXMLBible.bookTag, element.tag))
    # end of OpenSongXMLBible.load

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book's 'n' attribute is converted to a book code with
        self.genericBOS.getBBB(). If there is no name, or no code is
        returned for it, an error is logged and nothing is saved.
        """
        if Globals.verbosityLevel > 3:
            print(_("Validating OpenSong XML book..."))

        # Process the div attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in book element".format(
                        attrib, value))

        # The two error messages below belong to the failure paths only
        if not bookName:
            logging.error(_("OpenSong load can't find a book name"))
            return
        BBB = self.genericBOS.getBBB(bookName)
        if not BBB:
            logging.error(
                _("OpenSong load doesn't recognize book name: '{}'").format(
                    bookName))
            return

        if Globals.verbosityLevel > 2:
            print(_("Validating {} {}...").format(BBB, bookName))
        thisBook = BibleBook(self.name, BBB)
        thisBook.objectNameString = "OpenSong XML Bible Book object"
        thisBook.objectTypeString = "OpenSong"
        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                Globals.checkXMLNoText(element, sublocation, 'j3jd')
                Globals.checkXMLNoTail(element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error("Expected to find '{}' but got '{}'".format(
                    OpenSongXMLBible.chapterTag, element.tag))
        if Globals.verbosityLevel > 2:
            print("  Saving {} into results...".format(BBB))
        self.saveBook(thisBook)
    # end of OpenSongXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        A 'c' line is appended to thisBook for the chapter number and a 'v'
        line for each verse that has text. Verse text containing newlines is
        split into 'q1' lines. A verse with no 'n' attribute is logged as an
        error and skipped.
        """
        if Globals.verbosityLevel > 3:
            print(_("Validating XML chapter..."))

        # Process the div attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                pass  # Recognised (so no warning is logged) but not used
            else:
                logging.warning(
                    "Unprocessed '{}' attribute ({}) in chapter element".format(
                        attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace(
                'of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.appendLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error("Expected to find '{}' but got '{}'".format(
                    OpenSongXMLBible.verseTag, element.tag))
                continue

            sublocation = "verse in {} {}".format(BBB, chapterNumber)
            Globals.checkXMLNoTail(element, sublocation, 'l5ks')
            Globals.checkXMLNoSubelements(element, sublocation, '5f7h')
            verseNumber = None
            for attrib, value in element.items():
                if attrib == "n":
                    verseNumber = value
                elif attrib == "t":
                    pass  # Recognised (so no warning is logged) but not used
                else:
                    logging.warning(
                        "Unprocessed '{}' attribute ({}) in verse element".format(
                            attrib, value))
            if Globals.debugFlag:
                assert (verseNumber)
            if verseNumber is None:
                # Without this guard the string concatenations below would
                #   raise TypeError on None
                logging.error(
                    "Missing 'n' attribute in verse element in {} {}".format(
                        BBB, chapterNumber))
                continue

            vText = element.text
            if not vText:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, verseNumber))
            elif '\n' in vText:  # This is how they represent poetry
                for j, textBit in enumerate(vText.split('\n')):
                    if j == 0:
                        thisBook.appendLine('q1', '')
                        thisBook.appendLine('v', verseNumber + ' ' + textBit)
                    else:
                        thisBook.appendLine('q1', textBit)
            else:  # Just one verse line
                thisBook.appendLine('v', verseNumber + ' ' + vText)