def load(self):
    """
    Load a single Forge for SwordSearcher source file and load book elements.

    The file is read line by line (blank lines are discarded):
      * lines starting with ';' are header/comment lines; recognised ones
        are copied into a settings dictionary
      * a '$$ {NAME}' line starts a metadata section whose following text
        lines are collected (joined with newlines) under NAME
      * a '$$ Book C:V' line is a pointer giving the reference for the
        verse text on the following line
      * any other line is verse text, whose inline markup is converted
        ({..} to footnotes, [..] to added text, +r/..-r/ to words of Jesus)
        before being added to the current book.

    Completed books are passed to self.stashBook(). Any collected settings
    are saved in self.suppliedMetadata['Forge4SS'] and applied, then
    self.doPostLoadProcessing() is called. Nothing is returned.
    """
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print(_("Loading {}…").format(self.sourceFilepath))

    # The three organisational systems are module-level and only created once
    global BOS66, BOS81, BOSx
    if BOS66 is None:
        BOS66 = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')
    if BOS81 is None:
        BOS81 = BibleOrganizationalSystem('GENERIC-KJV-80-ENG')
    if BOSx is None:
        BOSx = BibleOrganizationalSystem('GENERIC-ENG')

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}

    def replaceEach(pattern, makeReplacement, text):
        """
        Replace the first match of pattern (rescanning from the start each
            time) with makeReplacement(group 1) until no match remains.
        """
        match = re.search(pattern, text)
        while match:
            text = text[:match.start()] + makeReplacement(match.group(1)) + text[match.end():]
            match = re.search(pattern, text)
        return text

    # Book codes that are mapped directly rather than guessed
    fixedBookCodes = {'Ge': 'GEN', 'Le': 'LEV', 'La': 'LAM'}

    lineCount = 0
    bookCode = BBB = metadataName = None
    metadataContents = ''
    # Give the reference fields defined values so that a text line (or a
    #   malformed pointer) before the first valid pointer can't hit unassigned names
    chapterNumber = verseNumber = None
    chapterNumberString = verseNumberString = ''
    lastBookCode = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    thisBook = None
    settingsDict = {}

    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1] == '\n':
                line = line[:-1]  # Remove the trailing newline character
            if not line:
                continue  # Just discard blank lines

            if lineCount == 1:
                if self.encoding.lower() == 'utf-8' and line[0] == chr(65279):  # U+FEFF or \ufeff
                    logging.info(" ForgeForSwordSearcherBible.load: Detected Unicode Byte Order Marker (BOM)")
                    line = line[1:]  # Remove the Unicode Byte Order Marker (BOM)
                match = re.search('^; TITLE:\\s', line)
                if match:
                    if BibleOrgSysGlobals.debugFlag:
                        print("First line got type {!r} match from {!r}".format(match.group(0), line))
                else:
                    if BibleOrgSysGlobals.verbosityLevel > 3:
                        print("ForgeForSwordSearcherBible.load: (unexpected) first line was {!r} in {}"
                              .format(line, self.sourceFilepath))
                    # NOTE(review): 'halt' looks like a deliberate debug-mode stop via an undefined name -- confirm
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                        halt
                    continue  # Skip the unexpected first line

            # Process header stuff
            if line[0] == ';':
                # NOTE(review): the [14:] offsets below do not equal the lengths of the
                #   prefixes that they follow ('; HAS ITALICS' is 13 characters, the other
                #   two are 15), so e.g. a bare '; HAS FOOTNOTES' line stores 'S'.
                #   Left as found because the intended value is unclear -- please confirm.
                if line.startswith('; TITLE:'):
                    string = line[8:].strip()
                    if string:
                        settingsDict['TITLE'] = string
                elif line.startswith('; ABBREVIATION:'):
                    string = line[15:].strip()
                    if string:
                        settingsDict['ABBREVIATION'] = string
                elif line.startswith('; HAS ITALICS'):
                    string = line[14:].strip()
                    if string:
                        settingsDict['HAS_ITALICS'] = string
                elif line.startswith('; HAS FOOTNOTES:'):
                    string = line[16:].strip()  # The prefix is 16 characters long (was sliced at 15, keeping the colon)
                    if string:
                        settingsDict['HAS_FOOTNOTES'] = string
                elif line.startswith('; HAS FOOTNOTES'):
                    string = line[14:].strip()
                    if string:
                        settingsDict['HAS_FOOTNOTES'] = string
                elif line.startswith('; HAS REDLETTER'):
                    string = line[14:].strip()
                    if string:
                        settingsDict['HAS_REDLETTER'] = string
                else:
                    logging.warning("ForgeForSwordSearcherBible.load is skipping unknown header/comment line: {}"
                                    .format(line))
                continue  # Header and comment lines never contain verse text

            # Process the main segment
            if line.startswith('$$ '):
                # Any new pointer ends the current metadata section (if any)
                if metadataName and metadataContents:
                    settingsDict[metadataName] = metadataContents
                metadataName = None  # Always reset, so an empty section can't swallow the verse text that follows
                pointer = line[3:]
                if pointer and pointer[0] == '{' and pointer[-1] == '}':
                    metadataName = pointer[1:-1]
                    if metadataName:
                        metadataContents = ''
                else:  # let's assume it's a BCV reference
                    # Close up numbered book names so the book code contains no space
                    pointer = pointer.replace( '1 K','1K' ).replace( '2 K','2K' ) \
                                    .replace( '1 Chr','1Chr' ).replace( '2 Chr','2Chr' ) \
                                    .replace( '1 Cor','1Cor' ).replace( '2 Cor','2Cor' ) \
                                    .replace( '1 Thess','1Thess' ).replace( '2 Thess','2Thess' ) \
                                    .replace( '1 Tim','1Tim' ).replace( '2 Tim','2Tim' ) \
                                    .replace( '1 Pet','1Pet' ).replace( '2 Pet','2Pet' ) \
                                    .replace( '1 J','1J' ).replace( '2 J','2J' ).replace( '3 J','3J' )
                    B_CV_Bits = pointer.split(' ', 1)
                    if len(B_CV_Bits) == 2 and ':' in B_CV_Bits[1]:
                        bookCode, CVString = B_CV_Bits
                        chapterNumberString, verseNumberString = CVString.split(':')
                        chapterNumber = int(chapterNumberString)
                        verseNumber = int(verseNumberString)
                        if bookCode != lastBookCode:  # We've started a new book
                            if bookCode in fixedBookCodes:
                                BBB = fixedBookCodes[bookCode]
                            else:  # Try to guess, using progressively wider systems
                                BBB = BOS66.getBBBFromText(bookCode)
                                if not BBB:
                                    BBB = BOS81.getBBBFromText(bookCode)
                                if not BBB:
                                    BBB = BOSx.getBBBFromText(bookCode)
                    else:
                        print("Unexpected number of bits", self.givenName, BBB, bookCode,
                              chapterNumberString, verseNumberString, len(B_CV_Bits), B_CV_Bits)
                continue  # Just save the pointer information which refers to the text on the next line

            # It's not a $$ line, so it's either metadata content or verse text
            if metadataName:
                metadataContents += ('\n' if metadataContents else '') + line
                continue

            vText = line
            # Handle bits like (<scripref>Pr 2:7</scripref>)
            vText = vText.replace('(<scripref>', '\\x - \\xt ').replace('</scripref>)', '\\x*')
            vText = vText.replace('<scripref>', '\\x - \\xt ').replace('</scripref>', '\\x*')

            # Convert {stuff} to footnotes
            vText = replaceEach('\\{(.+?)\\}',
                                lambda inner: '\\f + \\fr {}:{} \\ft {}\\f*'.format(chapterNumber, verseNumber, inner),
                                vText)
            # Convert [stuff] to added fields
            vText = replaceEach('\\[(.+?)\\]', lambda inner: '\\add {}\\add*'.format(inner), vText)
            # Convert +r/This text is red-letter-r/ to wj fields
            vText = replaceEach('\\+r/(.+?)-r/', lambda inner: '\\wj {}\\wj*'.format(inner), vText)

            # Final check for unexpected remaining formatting
            if any(badChar in vText for badChar in '{}[]/'):
                logging.warning("Found remaining braces,brackets or slashes in SwordSearcher Forge VPL {} {}:{} {!r}"
                                .format(BBB, chapterNumberString, verseNumberString, vText))

            if not bookCode:  # No bookCode yet
                logging.warning("ForgeForSwordSearcherBible.load is skipping unknown pre-book line: {}"
                                .format(line))
                continue

            if bookCode != lastBookCode:  # We've started a new book
                if thisBook is not None:  # Better save the last book
                    self.stashBook(thisBook)
                    # Forget it, so that it can't be stashed again if the new book code turns out to be unknown
                    thisBook = None
                if BBB:
                    if BBB in self:
                        logging.critical("Have duplicated {} book in {}".format(BBB, self.givenName))
                    if BibleOrgSysGlobals.debugFlag:
                        assert BBB not in self
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = 'ForgeForSwordSearcher Bible Book object'
                    thisBook.objectTypeString = 'ForgeForSwordSearcher'
                    numChapters = len(BOSx.getNumVersesList(BBB))
                    lastBookCode = bookCode
                    lastChapterNumber = lastVerseNumber = -1
                else:
                    logging.critical("ForgeForSwordSearcherBible could not figure out {!r} book code"
                                     .format(bookCode))
                    if BibleOrgSysGlobals.debugFlag:
                        halt

            if not BBB:
                continue  # Can't save text for a book that we couldn't identify

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag:
                    assert chapterNumber > lastChapterNumber or BBB == 'ESG'  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info("Have chapter zero in {} {} {} {}:{}".format(
                        self.givenName, BBB, bookCode, chapterNumberString, verseNumberString))
                elif chapterNumber > numChapters:
                    logging.error("Have high chapter number in {} {} {} {}:{} (expected max of {})".format(
                        self.givenName, BBB, bookCode, chapterNumberString, verseNumberString, numChapters))
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(_("Ignored duplicate verse line in {} {} {} {}:{}").format(
                    self.givenName, BBB, bookCode, chapterNumberString, verseNumberString))
                continue
            # The two cases below are only reported -- the verse line is still added
            if verseNumber < lastVerseNumber:
                logging.warning(_("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format(
                    lastVerseNumber, verseNumber, self.givenName, BBB, bookCode,
                    chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:  # The text must differ, or we'd have skipped the line above
                logging.warning(_("Ignored duplicated {} verse number in {} {} {} {}:{}").format(
                    verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString))

            # Check for paragraph markers
            if vText and vText[0] == '¶':
                thisBook.addLine('p', '')
                vText = vText[1:].lstrip()

            thisBook.addLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book
    if thisBook is not None:
        self.stashBook(thisBook)

    # Save a metadata section that ran right to the end of the file
    if metadataName and metadataContents:
        settingsDict[metadataName] = metadataContents

    # Clean up
    if settingsDict:
        self.suppliedMetadata['Forge4SS'] = settingsDict
        self.applySuppliedMetadata('Forge4SS')  # Copy some to self.settingsDict

    self.doPostLoadProcessing()
class VerseViewXMLBible( Bible ):
    """
    Class for reading, validating, and converting VerseViewXMLBible XML.

    The expected layout (as read by the code below) is a 'bible' root element
        containing some text-only header elements (fname, revision, title,
        font, copyright, sizefactor) and 'b' (book) elements, which contain
        'c' (chapter) elements, which contain 'v' (verse) elements.
    Books, chapters and verses each carry their name/number in an 'n' attribute.
    """
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'bible'
    filenameTag = 'fname'
    revisionTag = 'revision'
    titleTag = 'title'
    fontTag = 'font'
    copyrightTag = 'copyright'
    sizefactorTag = 'sizefactor'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    # Text-only header elements: tag -> ( label used in messages, metadata key or None if the text isn't saved )
    _headerFields = {
        filenameTag: ( 'filename', None ),
        revisionTag: ( 'revision', 'Revision' ),
        titleTag: ( 'title', 'Title' ),
        fontTag: ( 'font', 'Font' ),
        copyrightTag: ( 'copyright', 'Copyright' ),
        sizefactorTag: ( 'sizefactor', None ),
        }


    def __init__( self, sourceFolder, givenName, encoding='utf-8' ):
        """
        Constructor: just sets up the VerseView Bible object.

        sourceFolder and givenName are joined to give the path of the XML file.
        The file is not read here (see load) -- only its readability is checked.
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = 'VerseView XML Bible object'
        self.objectTypeString = 'VerseView'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join( self.sourceFolder, self.givenName )

        self.tree = self.header = None # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' )

        # Do a preliminary check on the readability of our file
        if not os.access( self.sourceFilepath, os.R_OK ):
            print( "VerseViewXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) )

        self.name = self.givenName
    # end of VerseViewXMLBible.__init__


    def load( self ):
        """
        Load a single source XML file and load book elements.

        The header elements that are kept are saved in self.suppliedMetadata['VerseView']
            (under 'Revision', 'Title', 'Font' and 'Copyright') and each book element
            is passed (with its 1-based position) to __validateAndExtractBook.
        Unexpected elements are logged as errors and skipped.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) )
        self.tree = ElementTree().parse( self.sourceFilepath )
        if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all

        if self.suppliedMetadata is None: self.suppliedMetadata = {}
        metadata = self.suppliedMetadata['VerseView'] = {}

        # Find the main (bible) container
        if self.tree.tag == VerseViewXMLBible.treeTag:
            location = "VerseView XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoAttributes( self.tree, location, 'js24' )
            BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' )

            # Find the submain (various info and then book) containers
            bookNumber = 0
            for element in self.tree:
                if element.tag in VerseViewXMLBible._headerFields:
                    label, metadataKey = VerseViewXMLBible._headerFields[element.tag]
                    sublocation = label + " in " + location
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'jk86' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'hjk7' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'bh09' )
                    if metadataKey:
                        metadata[metadataKey] = element.text
                    elif element.tag == VerseViewXMLBible.sizefactorTag:
                        if BibleOrgSysGlobals.debugFlag: assert element.text == '1'
                elif element.tag == VerseViewXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                    bookNumber += 1
                    self.__validateAndExtractBook( element, bookNumber )
                else: logging.error( "xk15 Expected to find {!r} but got {!r}".format( VerseViewXMLBible.bookTag, element.tag ) )
        else: logging.error( "Expected to load {!r} but got {!r}".format( VerseViewXMLBible.treeTag, self.tree.tag ) )

        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            # These are all expected to exist, but use get() so that a file
            #   which lacks one of them doesn't abort the load with a KeyError
            for metadataKey in ( 'Revision', 'Title', 'Font', 'Copyright' ):
                print( "{} is {!r}".format( metadataKey, metadata.get( metadataKey ) ) )

        self.applySuppliedMetadata( 'VerseView' ) # Copy some to self.settingsDict
        self.doPostLoadProcessing()
    # end of VerseViewXMLBible.load


    def __validateAndExtractBook( self, book, bookNumber ):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        book is the XML book element and bookNumber is its 1-based position in the file.
        The book code is looked up from the 'n' (name) attribute, but the code
            obtained from bookNumber is the one that is used if the two differ.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") )

        # Process the book attributes first
        BBB = bookName = None
        for attrib,value in book.items():
            if attrib=="n":
                bookName = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )

        if bookName:
            BBB = self.genericBOS.getBBBFromText( bookName )
            if BBB is None: # Try again without any accents
                adjustedBookName = BibleOrgSysGlobals.removeAccents( bookName )
                if adjustedBookName != bookName:
                    BBB = self.genericBOS.getBBBFromText( adjustedBookName )
        else: # We can still identify the book by its position (below)
            logging.error( "Missing 'n' attribute in book element {}".format( bookNumber ) )

        BBB2 = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
        if BBB2 != BBB: # Just double check using the book number
            if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
                print( "Assuming that book {} {!r} is {} (not {})".format( bookNumber, bookName, BBB2, BBB ) )
            BBB = BBB2

        if BBB:
            if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}…").format( BBB, bookName ) )
            thisBook = BibleBook( self, BBB )
            thisBook.objectNameString = 'VerseView XML Bible Book object'
            thisBook.objectTypeString = 'VerseView'
            for element in book:
                if element.tag == VerseViewXMLBible.chapterTag:
                    sublocation = "chapter in {}".format( BBB )
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'j3jd' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                    self.__validateAndExtractChapter( BBB, thisBook, element )
                else: logging.error( "vb26 Expected to find {!r} but got {!r}".format( VerseViewXMLBible.chapterTag, element.tag ) )
            if BibleOrgSysGlobals.verbosityLevel > 2: print( " Saving {} into results…".format( BBB ) )
            self.stashBook( thisBook )
    # end of VerseViewXMLBible.__validateAndExtractBook


    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        A 'c' line is added to thisBook if the chapter has an 'n' attribute
            (otherwise an error is logged), and then each verse element is
            passed to __validateAndExtractVerse.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule and BibleOrgSysGlobals.verbosityLevel > 3:
            print( _("Validating XML chapter…") )

        # Process the chapter attributes first
        chapterNumber = None
        for attrib,value in chapter.items():
            if attrib=="n":
                chapterNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber:
            thisBook.addLine( 'c', chapterNumber )
        else: logging.error( "Missing 'n' attribute in chapter element for {}".format( BBB ) )

        for element in chapter:
            if element.tag == VerseViewXMLBible.verseTag:
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            else: logging.error( "sv34 Expected to find {!r} but got {!r}".format( VerseViewXMLBible.verseTag, element.tag ) )
    # end of VerseViewXMLBible.__validateAndExtractChapter


    def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
        """
        Check/validate and extract verse data from the given XML book record
            finding and saving verse elements.

        The verse number comes from the 'n' attribute. The (stripped) element
            text is added to thisBook as a 'v' line, with any newlines inside it
            replaced by spaces. A verse without any text adds nothing, and a
            verse without a number is logged as an error and skipped.
        Subelements are not processed (they are only reported by the check below).
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule and BibleOrgSysGlobals.verbosityLevel > 3:
            print( _("Validating XML verse…") )

        location = "verse in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoSubelements( verse, location, 'sg20' )
        BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

        # Handle verse attributes
        verseNumber = None
        for attrib,value in verse.items():
            if attrib=="n":
                verseNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
        if not verseNumber: # We can't build the 'v' line without it
            logging.error( "Missing 'n' attribute in verse element for {} {}".format( BBB, chapterNumber ) )
            if BibleOrgSysGlobals.debugFlag: assert verseNumber
            return

        vText = '' if verse.text is None else verse.text.strip()
        if vText: # This is the main text of the verse
            if '\n' in vText:
                print( "VerseViewXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                vText = vText.replace( '\n', ' ' )
            thisBook.addLine( 'v', verseNumber + ' ' + vText )
    # end of VerseViewXMLBible.__validateAndExtractVerse
# end of VerseViewXMLBible class
class HaggaiXMLBible( Bible ): """ Class for reading, validating, and converting HaggaiXMLBible XML. """ XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}" treeTag = 'XMLBIBLE' infoTag = 'INFORMATION' bookTag = 'BIBLEBOOK' chapterTag = 'CHAPTER' captionTag = 'CAPTION' paragraphTag = 'PARAGRAPH' verseTag = 'VERSE' noteTag = 'NOTE' styleTag = 'STYLE' breakTag = 'BR' def __init__( self, sourceFolder, givenName, encoding='utf-8' ): """ Constructor: just sets up the Haggai Bible object. """ # Setup and initialise the base class first Bible.__init__( self ) self.objectNameString = 'Haggai XML Bible object' self.objectTypeString = 'Haggai' # Now we can set our object variables self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding self.sourceFilepath = os.path.join( self.sourceFolder, self.givenName ) self.tree = self.header = None # Will hold the XML data # Get the data tables that we need for proper checking #self.ISOLanguages = ISO_639_3_Languages().loadData() self.genericBOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' ) # Do a preliminary check on the readability of our file if not os.access( self.sourceFilepath, os.R_OK ): print( "HaggaiXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) ) self.name = self.givenName #if self.name is None: #pass # end of HaggaiXMLBible.__init__ def load( self ): """ Load a single source XML file and load book elements. 
""" if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) ) try: self.tree = ElementTree().parse( self.sourceFilepath ) except ParseError as err: logging.critical( exp("Loader parse error in xml file {}: {} {}").format( self.givenName, sys.exc_info()[0], err ) ) #loadErrors.append( exp("Loader parse error in xml file {}: {} {}").format( self.givenName, sys.exc_info()[0], err ) ) #self.addPriorityError( 100, C, V, _("Loader parse error in xml file {}: {}").format( self.givenName, err ) ) if BibleOrgSysGlobals.debugFlag: assert len ( self.tree ) # Fail here if we didn't load anything at all # Find the main (bible) container if self.tree.tag == HaggaiXMLBible.treeTag: location = "Haggai XML file" BibleOrgSysGlobals.checkXMLNoText( self.tree, location, '4f6h' ) BibleOrgSysGlobals.checkXMLNoTail( self.tree, location, '1wk8' ) schema = name = status = BibleType = revision = version = lgid = None for attrib,value in self.tree.items(): if attrib == HaggaiXMLBible.XMLNameSpace + 'noNamespaceSchemaLocation': schema = value elif attrib == "biblename": name = value elif attrib == "lgid": lgid = value # In italian.xml this is set to "german" elif attrib == "status": status = value elif attrib == "type": BibleType = value elif attrib == "revision": revision = value elif attrib == 'version': version = value else: logging.warning( "Unprocessed {!r} attribute ({}) in main element".format( attrib, value ) ) if name: self.name = name if status: self.status = status if revision: self.revision = revision if version: self.version = version if self.tree[0].tag == 'INFORMATION': self.header = self.tree[0] self.tree.remove( self.header ) self.__validateAndExtractHeader() else: # Handle information records at the END of the file ix = len(self.tree) - 1 if self.tree[ix].tag == 'INFORMATION': self.header = self.tree[ix] self.tree.remove( self.header ) self.__validateAndExtractHeader() # Find the submain (book) containers for element in self.tree: if 
element.tag == HaggaiXMLBible.bookTag: sublocation = "book in " + location BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' ) self.__validateAndExtractBook( element ) else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.bookTag, element.tag ) ) else: logging.error( "Expected to load {!r} but got {!r}".format( HaggaiXMLBible.treeTag, self.tree.tag ) ) self.doPostLoadProcessing() # end of HaggaiXMLBible.load def __validateAndExtractHeader( self ): """ Extracts information out of the header record, such as: <INFORMATION> <title>King James Version</title> <creator></creator> <subject>The Holy Bible</subject> <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started. It was finished in 1611, just 85 years after the first translation of the New Testament into English appeared (Tyndale, 1526). The Authorized Version, or King James Version, quickly became the standard for English-speaking Protestants. 
Its flowing language and prose rhythm has had a profound influence on the literature of the past 300 years.</description> <publisher>FREE BIBLE SOFTWARE GROUP</publisher> <contributors /> <date>2009-01-23</date> <type>Bible</type> <format>Haggai XML Bible Markup Language</format> <identifier>kjv</identifier> <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source> <language>ENG</language> <coverage>provide the Bible to the nations of the world</coverage> <rights>We believe that this Bible is found in the Public Domain.</rights> </INFORMATION> """ if BibleOrgSysGlobals.debugFlag: assert self.header location = 'Header' BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' ) BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' ) BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' ) # TODO: We probably need to rationalise some of the self.xxx stores for element in self.header: #print( "header", element.tag ) if element.tag == 'title': sublocation = "title in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text self.title = element.text elif element.tag == 'creator': sublocation = "creator in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.creator = element.text elif element.tag == 'subject': sublocation = "subject in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.subject = 
element.text elif element.tag == 'description': sublocation = "description in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text self.description = element.text elif element.tag == 'publisher': sublocation = "publisher in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.publisher = element.text elif element.tag == 'contributor': sublocation = "contributor in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'alj1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jjd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5gk78' ) if element.text: try: self.contributor = [ self.contributor, element.text ] # Put multiples into a list except AttributeError: self.contributor = element.text # Must be the first (and possibly only) one elif element.tag == 'contributors': sublocation = "contributors in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.contributors = element.text elif element.tag == 'date': sublocation = "date in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text self.date = element.text elif element.tag == 'type': sublocation = 
"type in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.documentType = element.text elif element.tag == 'format': sublocation = "format in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text if BibleOrgSysGlobals.debugFlag: assert element.text == 'Haggai XML Bible Markup Language' elif element.tag == 'identifier': sublocation = "identifier in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text self.identifier = element.text elif element.tag == 'source': sublocation = "source in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text self.source = element.text elif element.tag == 'language': sublocation = "language in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if BibleOrgSysGlobals.debugFlag: assert element.text self.language = element.text elif element.tag == 'coverage': sublocation = "coverage in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) 
BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.coverage = element.text elif element.tag == 'rights': sublocation = "rights in {}".format( location ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, '5g78' ) if element.text: self.rights = element.text else: logging.error( "Found unexpected {!r} tag in {}".format( element.tag, location ) ) # end of HaggaiXMLBible.__validateAndExtractHeader def __validateAndExtractBook( self, book ): """ Check/validate and extract book data from the given XML book record finding chapter subelements. """ if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") ) # Process the div attributes first BBB = bookName = bookShortName = bookNumber = None for attrib,value in book.items(): if attrib=="bnumber": bookNumber = value elif attrib=="bname": bookName = value elif attrib=="bsname": bookShortName = value else: logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) ) if bookNumber: try: BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber ) except KeyError: logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it" \ .format( bookNumber, bookName, bookShortName ) ) elif bookName: BBB = self.genericBOS.getBBBFromText( bookName ) if BBB: if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}…").format( BBB, bookName ) ) thisBook = BibleBook( self, BBB ) thisBook.objectNameString = 'Haggai XML Bible Book object' thisBook.objectTypeString = 'Haggai' #thisBook.sourceFilepath = self.sourceFilepath for element in book: if element.tag == HaggaiXMLBible.captionTag: sublocation = "caption in {}".format( BBB ) BibleOrgSysGlobals.checkXMLNoAttributes( 
element, sublocation, 'jhl6' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'jk21' ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'kjh6' ) thisBook.addLine( 'mt', element.text ) elif element.tag == HaggaiXMLBible.chapterTag: sublocation = "chapter in {}".format( BBB ) BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'j3jd' ) BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' ) self.__validateAndExtractChapter( BBB, thisBook, element ) else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, element.tag ) ) if BibleOrgSysGlobals.verbosityLevel > 2: print( " Saving {} into results…".format( BBB ) ) self.stashBook( thisBook ) # end of HaggaiXMLBible.__validateAndExtractBook def __validateAndExtractChapter( self, BBB, thisBook, chapter ): """ Check/validate and extract chapter data from the given XML book record finding and saving chapter numbers and finding and saving verse elements. """ if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter…") ) # Process the chapter attributes first chapterNumber = numVerses = None for attrib,value in chapter.items(): if attrib=="cnumber": chapterNumber = value else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) ) if chapterNumber: #print( BBB, 'c', chapterNumber ) thisBook.addLine( 'c', chapterNumber ) else: logging.error( "Missing 'n' attribute in chapter element for {}".format( BBB ) ) for element in chapter: if element.tag == HaggaiXMLBible.paragraphTag: location = "paragraph in {} {}".format( BBB, chapterNumber ) self.__validateAndExtractParagraph( BBB, chapterNumber, thisBook, element ) elif element.tag == HaggaiXMLBible.verseTag+'disabled': location = "verse in {} {}".format( BBB, chapterNumber ) self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element ) elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms location = "caption in {} 
{}".format( BBB, chapterNumber ) BibleOrgSysGlobals.checkXMLNoTail( element, location, 'k5k8' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'd3f5' ) # Handle caption attributes vRef = None for attrib,value in element.items(): if attrib=="vref": vRef = value if BibleOrgSysGlobals.debugFlag: assert vRef == '1' else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) ) if BibleOrgSysGlobals.debugFlag: assert vRef vText = element.text if not vText: logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) ) if vText: # This is the main text of the caption #print( "{} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) ) thisBook.addLine( 'v', '0' + ' ' + vText ) # We save it as verse zero else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, element.tag ) ) # end of HaggaiXMLBible.__validateAndExtractChapter def __validateAndExtractParagraph( self, BBB, chapterNumber, thisBook, paragraph ): """ Check/validate and extract paragraph data from the given XML book record finding and saving paragraphs and finding and saving verse elements. 
""" if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML paragraph…") ) location = "paragraph in {} {}".format( BBB, chapterNumber ) BibleOrgSysGlobals.checkXMLNoAttributes( paragraph, location, 'brgw3' ) BibleOrgSysGlobals.checkXMLNoText( paragraph, location, 'brgw3' ) BibleOrgSysGlobals.checkXMLNoTail( paragraph, location, 'brgw3' ) thisBook.addLine( 'p', '' ) # Handle verse subelements (verses) for element in paragraph: if element.tag == HaggaiXMLBible.verseTag: location = "verse in {} {}".format( BBB, chapterNumber ) self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element ) elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms location = "caption in {} {}".format( BBB, chapterNumber ) BibleOrgSysGlobals.checkXMLNoTail( element, location, 'k5k8' ) BibleOrgSysGlobals.checkXMLNoSubelements( element, location, 'd3f5' ) # Handle caption attributes vRef = None for attrib,value in element.items(): if attrib=="vref": vRef = value if BibleOrgSysGlobals.debugFlag: assert vRef == '1' else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) ) if BibleOrgSysGlobals.debugFlag: assert vRef vText = element.text if not vText: logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) ) if vText: # This is the main text of the caption #print( "{} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) ) thisBook.addLine( 'v', '0' + ' ' + vText ) # We save it as verse zero else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, element.tag ) ) # end of HaggaiXMLBible.__validateAndExtractParagraph def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ): """ Check/validate and extract verse data from the given XML book record finding and saving verse elements. 
""" if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") ) location = "verse in {} {}".format( BBB, chapterNumber ) BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' ) # Handle verse attributes verseNumber = toVerseNumber = None for attrib,value in verse.items(): if attrib=="vnumber": verseNumber = value else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) ) if BibleOrgSysGlobals.debugFlag: assert verseNumber location = "{}:{}".format( location, verseNumber ) # Get a better location description #thisBook.addLine( 'v', verseNumber ) vText = '' if verse.text is None else verse.text if vText: vText = vText.strip() #if not vText: # This happens if a verse starts immediately with a style or note #logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, verseNumber ) ) # Handle verse subelements (notes and styled portions) for subelement in verse: if subelement.tag == HaggaiXMLBible.noteTag: sublocation = "note in " + location noteType = None for attrib,value in subelement.items(): if attrib=="type": noteType = value else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) ) if noteType and noteType not in ('variant',): logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) ) nText, nTail = subelement.text, subelement.tail #print( "note", BBB, chapterNumber, verseNumber, noteType, repr(nText), repr(nTail) ) vText += "\\f + \\fk {} \\ft {}\\f*".format( noteType, nText ) if noteType else "\\f + \\ft {}\\f*".format( nText ) if nTail: if '\n' in nTail: print( "HaggaiXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) ) nTail = nTail.replace( '\n', ' ' ) vText += nTail for subsubelement in subelement: if subsubelement.tag == HaggaiXMLBible.styleTag: subsublocation = "style in " + sublocation BibleOrgSysGlobals.checkXMLNoSubelements( subsubelement, subsublocation, 'fyt4' ) fs 
= css = idStyle = None for attrib,value in subsubelement.items(): if attrib=='fs': fs = value #elif attrib=="css": css = value #elif attrib=="id": idStyle = value else: logging.warning( "Unprocessed {!r} attribute ({}) in style subsubelement".format( attrib, value ) ) if BibleOrgSysGlobals.debugFlag: assert fs or css or idStyle SFM = None if fs == 'italic': SFM = '\\it' elif fs == 'super': SFM = '\\bdit' elif fs == 'emphasis': SFM = '\\em' else: print( "fs is", fs, "css is", css, "idStyle is", idStyle ); halt #if css == "font-style:italic": SFM = '\\it' #elif css == "font-style:italic;font-weight:bold": SFM = '\\bdit' #elif css == "color:#FF0000": SFM = '\\em' #elif css == "font-size: x-small; color:#8B8378": SFM = '\\add' #elif css is None and idStyle=='cl:divineName': SFM = '\\nd' #else: print( "css is", css, "idStyle is", idStyle ); halt sText, sTail = subsubelement.text.strip(), subsubelement.tail if BibleOrgSysGlobals.debugFlag: assert sText if SFM: vText += SFM+' ' + sText + SFM+'*' else: vText += '\\sc ' + '['+css+']' + sText + '\\sc* ' # Use sc for unknown styles if sTail: vText += sTail.strip() else: logging.error( "Expected to find {} but got {!r} in {}".format( HaggaiXMLBible.styleTag, subsubelement.tag, sublocation ) ) elif subelement.tag == HaggaiXMLBible.styleTag: sublocation = "style in " + location BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'f5gh' ) fs = css = idStyle = None for attrib,value in subelement.items(): if attrib=="fs": fs = value #elif attrib=="css": css = value #elif attrib=="id": idStyle = value else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) ) if BibleOrgSysGlobals.debugFlag: assert fs SFM = None if fs == 'super': SFM = '\\bdit' elif fs == 'emphasis': SFM = '\\em' else: print( "fs is", fs, "css is", css, "idStyle is", idStyle ); halt #if css == "font-style:italic": SFM = '\\it' #elif css == "font-style:italic;font-weight:bold": SFM = '\\bdit' #elif css == 
"color:#FF0000": SFM = '\\em' #elif css == "font-size: x-small; color:#8B8378": SFM = '\\add' #elif css is None and idStyle=='cl:divineName': SFM = '\\nd' #else: print( "css is", css, "idStyle is", idStyle ); halt sText, sTail = subelement.text.strip(), subelement.tail if BibleOrgSysGlobals.debugFlag: assert sText #print( BBB, chapterNumber, sublocation ) if SFM: vText += SFM+' ' + sText + SFM+'*' else: vText += '\\sc ' + '['+css+']' + sText + '\\sc* ' # Use sc for unknown styles if sTail: vText += sTail.strip() elif subelement.tag == HaggaiXMLBible.breakTag: sublocation = "line break in " + location BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' ) BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' ) art = None for attrib,value in subelement.items(): if attrib=="art": art = value else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) ) if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl' #print( BBB, chapterNumber, verseNumber ) #assert vText if vText: thisBook.addLine( 'v', verseNumber + ' ' + vText ); verseNumber = None vText = '' thisBook.addLine( 'm', subelement.tail.strip() if subelement.tail else '' ) #bTail = subelement.tail #if bTail: vText = bTail.strip() else: logging.error( "Expected to find NOTE or STYLE but got {!r} in {}".format( subelement.tag, location ) ) if vText: # This is the main text of the verse (follows the verse milestone) if '\n' in vText: print( "HaggaiXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) ) vText = vText.replace( '\n', ' ' ) thisBook.addLine( 'v', verseNumber + ' ' + vText ); verseNumber = None
class VerseViewXMLBible(Bible):
    """
    Class for reading, validating, and converting VerseViewXMLBible XML.

    The expected document is a <bible> root holding a handful of header
    elements (fname, revision, title, font, copyright, sizefactor)
    followed by <b> (book) elements, each containing <c> (chapter)
    elements which in turn contain <v> (verse) elements.
    """
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'bible'
    filenameTag = 'fname'
    revisionTag = 'revision'
    titleTag = 'title'
    fontTag = 'font'
    copyrightTag = 'copyright'
    sizefactorTag = 'sizefactor'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the VerseView Bible object.

        sourceFolder and givenName are joined to form self.sourceFilepath.
        Nothing is parsed here -- call load() for that.
        """
        # Setup and initialise the base class first
        Bible.__init__(self)
        self.objectNameString = 'VerseView XML Bible object'
        self.objectTypeString = 'VerseView'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.XMLTree = self.header = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')

        # Do a preliminary check on the readability of our file
        # (only reported here -- load() will still attempt to parse it)
        if not os.access(self.sourceFilepath, os.R_OK):
            print("VerseViewXMLBible: File {!r} is unreadable".format(
                self.sourceFilepath))

        self.name = self.givenName
    # end of VerseViewXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        Header values are saved into self.suppliedMetadata['VerseView']
        and each <b> element is passed to __validateAndExtractBook
        along with its 1-based position among the book elements.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading {}…").format(self.sourceFilepath))
        self.XMLTree = ElementTree().parse(self.sourceFilepath)
        if BibleOrgSysGlobals.debugFlag:
            assert len(self.XMLTree)  # Fail here if we didn't load anything at all

        if self.suppliedMetadata is None:
            self.suppliedMetadata = {}
        metadata = self.suppliedMetadata['VerseView'] = {}

        # Header elements all get the same three checks:
        #   tag -> ( name used in messages, metadata key or None if not saved )
        headerFields = {
            VerseViewXMLBible.filenameTag: ('filename', None),
            VerseViewXMLBible.revisionTag: ('revision', 'Revision'),
            VerseViewXMLBible.titleTag: ('title', 'Title'),
            VerseViewXMLBible.fontTag: ('font', 'Font'),
            VerseViewXMLBible.copyrightTag: ('copyright', 'Copyright'),
            VerseViewXMLBible.sizefactorTag: ('sizefactor', None),
        }

        # Find the main (bible) container
        if self.XMLTree.tag == VerseViewXMLBible.treeTag:
            location = "VerseView XML file"
            BibleOrgSysGlobals.checkXMLNoText(self.XMLTree, location, '4f6h')
            BibleOrgSysGlobals.checkXMLNoAttributes(self.XMLTree, location, 'js24')
            BibleOrgSysGlobals.checkXMLNoTail(self.XMLTree, location, '1wk8')

            # Find the submain (various info and then book) containers
            bookNumber = 0
            for element in self.XMLTree:
                if element.tag in headerFields:
                    description, metadataKey = headerFields[element.tag]
                    sublocation = description + " in " + location
                    BibleOrgSysGlobals.checkXMLNoAttributes(element, sublocation, 'jk86')
                    BibleOrgSysGlobals.checkXMLNoSubelements(element, sublocation, 'hjk7')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'bh09')
                    if metadataKey is not None:
                        metadata[metadataKey] = element.text
                    elif element.tag == VerseViewXMLBible.sizefactorTag:
                        if BibleOrgSysGlobals.debugFlag:
                            assert element.text == '1'
                elif element.tag == VerseViewXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'g3g5')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'd3f6')
                    bookNumber += 1
                    self.__validateAndExtractBook(element, bookNumber)
                else:
                    logging.error(
                        "xk15 Expected to find {!r} but got {!r}".format(
                            VerseViewXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load {!r} but got {!r}".format(
                VerseViewXMLBible.treeTag, self.XMLTree.tag))

        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            # These are all meant to be compulsory, but use get() so that a
            # malformed file shows None here rather than dying with a KeyError
            for label in ('Revision', 'Title', 'Font', 'Copyright'):
                print("{} is {!r}".format(label, metadata.get(label)))

        self.applySuppliedMetadata('VerseView')  # Copy some to self.settingsDict
        self.doPostLoadProcessing()
    # end of VerseViewXMLBible.load

    def __validateAndExtractBook(self, book, bookNumber):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        book is the <b> element and bookNumber is its 1-based position in
        the file. The book code is looked up from the 'n' (name) attribute,
        but the code derived from bookNumber takes precedence when the two
        disagree. A book that can't be identified is logged and skipped.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML book…"))

        # Process the div attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in book element".format(
                        attrib, value))

        BBB = None
        if bookName:  # Don't try to remove accents from a missing name
            BBB = self.genericBOS.getBBBFromText(bookName)
            if BBB is None:
                adjustedBookName = BibleOrgSysGlobals.removeAccents(bookName)
                if adjustedBookName != bookName:
                    BBB = self.genericBOS.getBBBFromText(adjustedBookName)

        try:
            BBB2 = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber(
                bookNumber)
        except KeyError:  # Unknown book number -- fall back to the name lookup
            BBB2 = None
        if BBB2 is not None and BBB2 != BBB:  # Just double check using the book number
            if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
                print("Assuming that book {} {!r} is {} (not {})".format(
                    bookNumber, bookName, BBB2, BBB))
            BBB = BBB2

        if not BBB:
            logging.warning(
                "Unable to deduce which book is number={}, name={} -- ignoring it"
                .format(bookNumber, bookName))
            return

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Validating {} {}…").format(BBB, bookName))
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'VerseView XML Bible Book object'
        thisBook.objectTypeString = 'VerseView'
        for element in book:
            if element.tag == VerseViewXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'j3jd')
                BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error(
                    "vb26 Expected to find {!r} but got {!r}".format(
                        VerseViewXMLBible.chapterTag, element.tag))
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print("  Saving {} into results…".format(BBB))
        self.stashBook(thisBook)
    # end of VerseViewXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        The chapter number comes from the 'n' attribute and is added to
        thisBook as a 'c' line. Verses are still processed (with a
        chapter number of None) if that attribute is missing.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule and BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter…"))

        # Process the chapter attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            thisBook.addLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag == VerseViewXMLBible.verseTag:
                self.__validateAndExtractVerse(BBB, chapterNumber, thisBook,
                                               element)
            else:
                logging.error("sv34 Expected to find {!r} but got {!r}".format(
                    VerseViewXMLBible.verseTag, element.tag))
    # end of VerseViewXMLBible.__validateAndExtractChapter

    def __validateAndExtractVerse(self, BBB, chapterNumber, thisBook, verse):
        """
        Check/validate and extract verse data from the given XML book record
            finding and saving verse elements.

        The verse number comes from the 'n' attribute. The stripped verse
        text (with any newlines changed to spaces) is added to thisBook as
        a 'v' line. A verse with no text adds nothing, and a verse with no
        number is logged and skipped.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule and BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML verse…"))

        location = "verse in {} {}".format(BBB, chapterNumber)
        BibleOrgSysGlobals.checkXMLNoSubelements(verse, location, 'sg20')
        BibleOrgSysGlobals.checkXMLNoTail(verse, location, 'l5ks')

        # Handle verse attributes
        verseNumber = None
        for attrib, value in verse.items():
            if attrib == "n":
                verseNumber = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in verse element".format(
                        attrib, value))
        if BibleOrgSysGlobals.debugFlag:
            assert verseNumber
        if verseNumber is None:
            # Can't build the 'v' line without a number (None + str would fail)
            logging.error(
                "Missing 'n' attribute in verse element in {} {}".format(
                    BBB, chapterNumber))
            return

        vText = (verse.text or '').strip()
        if vText:  # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print(
                    "VerseViewXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}"
                    .format(BBB, chapterNumber, verseNumber, vText))
                vText = vText.replace('\n', ' ')
            thisBook.addLine('v', verseNumber + ' ' + vText)
    # end of VerseViewXMLBible.__validateAndExtractVerse
# end of VerseViewXMLBible class
class OpenSongXMLBible(Bible):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The loader expects a <bible> root element containing <b> (book)
    elements, which contain <c> (chapter) elements, which in turn contain
    <v> (verse) elements.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the XML Bible file converter object.

        sourceFolder and givenName are joined to give the path of the XML
        file.  Nothing is parsed here -- call load() for that.  An unreadable
        file is only reported (printed), not treated as an error.
        """
        # Setup and initialise the base class first
        if BibleOrgSysGlobals.debugFlag:
            print("OpenSongXMLBible( {}, {}, {} )".format(
                sourceFolder, givenName, encoding))
        Bible.__init__(self)
        self.objectNameString = 'OpenSong XML Bible object'
        self.objectTypeString = 'OpenSong'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.tree = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')

        # Do a preliminary check on the readability of our file
        if not os.access(self.sourceFilepath, os.R_OK):
            print("OpenSongXMLBible: File {!r} is unreadable".format(
                self.sourceFilepath))

        self.name = self.givenName
    # end of OpenSongXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        Every <b> element directly under the root is handed to
        __validateAndExtractBook(); 'OT' and 'NT' elements at that level are
        silently ignored and anything else is logged as an error.
        doPostLoadProcessing() is always called at the end, even if the root
        element was not the expected one.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading {}…").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if BibleOrgSysGlobals.debugFlag:
            assert len(self.tree)  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == OpenSongXMLBible.treeTag:
            location = "XML file"
            BibleOrgSysGlobals.checkXMLNoText(self.tree, location, '4f6h')
            BibleOrgSysGlobals.checkXMLNoTail(self.tree, location, '1wk8')

            # The 'n' (name) and 'sn' (short name) attributes are recognised
            # but their values are not used; anything else gets reported.
            for attrib, value in self.tree.items():
                if attrib not in ('n', 'sn'):
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in main element".
                        format(attrib, value))

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == OpenSongXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'g3g5')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'd3f6')
                    self.__validateAndExtractBook(element)
                elif element.tag in ('OT', 'NT'):
                    pass  # Ignored without comment
                else:
                    logging.error("Expected to find {!r} but got {!r}".format(
                        OpenSongXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load {!r} but got {!r}".format(
                OpenSongXMLBible.treeTag, self.tree.tag))
        self.doPostLoadProcessing()
    # end of OpenSongXMLBible.load

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book is identified from its 'n' attribute, first with the generic
        (English) organisational system and then, if that fails, with the
        lazily loaded module-level BibleBooksNames table.  A book that cannot
        be identified is logged as an error and skipped.
        """
        global BibleBooksNames

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating OpenSong XML book…"))

        # Process the div attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in book element".format(
                        attrib, value))
        if not bookName:
            logging.error(_("OpenSong load can't find a book name"))
            return

        BBB = self.genericBOS.getBBBFromText(bookName)  # Booknames are usually in English
        if not BBB:  # wasn't English
            if BibleBooksNames is None:
                BibleBooksNames = BibleBooksNamesSystems().loadData()
            BBB = BibleBooksNames.getBBBFromText(bookName)  # Try non-English booknames
        if not BBB:
            logging.error(
                _("OpenSong load doesn't recognize book name: {!r}").format(bookName))
            return

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Validating {} {}…").format(BBB, bookName))
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'OpenSong XML Bible Book object'
        thisBook.objectTypeString = 'OpenSong'
        USFMAbbreviation = BibleOrgSysGlobals.BibleBooksCodes.getUSFMAbbreviation(BBB)
        thisBook.addLine(
            'id', '{} imported by {}'.format(USFMAbbreviation.upper(), ProgNameVersion))
        thisBook.addLine('h', bookName)
        thisBook.addLine('mt1', bookName)
        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'j3jd')
                BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error("Expected to find {!r} but got {!r}".format(
                    OpenSongXMLBible.chapterTag, element.tag))
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(" Saving {} into results…".format(BBB))
        self.stashBook(thisBook)
    # end of OpenSongXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        A 'c' line is added for the chapter and a 'v' line for each verse.
        Verse text containing newlines is split into a 'v' line followed by
        'q1' lines.  <i> sub-elements inside a verse become \\it ...\\it* spans.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter…"))

        # Process the div attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                pass  # Recognised, but the value is not used
            else:
                logging.warning(
                    "Unprocessed {!r} attribute ({}) in chapter element".
                    format(attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace(
                'of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.addLine('c', chapterNumber)
        else:
            logging.error(
                "Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error("Expected to find {!r} but got {!r}".format(
                    OpenSongXMLBible.verseTag, element.tag))
                continue

            sublocation = "verse in {} {}".format(BBB, chapterNumber)
            BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'l5ks')
            verseNumber = None
            for attrib, value in element.items():
                if attrib == "n":
                    verseNumber = value
                elif attrib == "t":
                    # NOTE(review): presumably the end of a verse range --
                    #   recognised, but the value is not used
                    pass
                else:
                    logging.warning(
                        "Unprocessed {!r} attribute ({}) in verse element".
                        format(attrib, value))
            if BibleOrgSysGlobals.debugFlag:
                assert verseNumber
            if verseNumber is None:
                # Without a number we can't build the 'v' line below
                logging.error(
                    "Missing 'n' attribute in verse element for {} {}".format(
                        BBB, chapterNumber))
                continue

            vText = element.text or ''
            for subelement in element:
                sub2location = "{} in {}".format(subelement.tag, sublocation)
                BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sub2location, 'ks03')
                BibleOrgSysGlobals.checkXMLNoSubelements(subelement, sub2location, 'ks05')
                if subelement.tag == 'i':
                    # ElementTree gives None (not '') for missing text/tail,
                    #   so guard against writing a literal 'None' into the verse
                    vText += '\\it {}\\it*{}'.format(
                        subelement.text or '', subelement.tail or '')
                else:
                    logging.error(
                        "Expected to find 'i' but got {!r}".format(subelement.tag))
            vText += element.tail or ''

            if not vText:
                logging.warning("{} {}:{} has no text".format(
                    BBB, chapterNumber, verseNumber))
                continue

            # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:  # This is how they represent poetry
                for j, textBit in enumerate(vText.split('\n')):
                    if j == 0:
                        thisBook.addLine('q1', '')
                        thisBook.addLine('v', verseNumber + ' ' + textBit)
                    else:
                        thisBook.addLine('q1', textBit)
            else:  # Just one verse line
                thisBook.addLine('v', verseNumber + ' ' + vText)
    # end of OpenSongXMLBible.__validateAndExtractChapter
def load(self):
    """
    Load a single verse-per-line (VPL) source file and load book elements.

    The flavour of the file is decided from its first line only:
        type 1 -- "<bookCode> <chapter>:<verse> <text>" lines
                    (a '.' is also accepted between chapter and verse)
        type 2 -- lines starting with an eight-digit reference (two digits
                    of book number, three of chapter, the rest verse),
                    then a tab, then the text
        type 3 -- starts with a "# language_name:" header line; the data
                    lines are parsed exactly as for type 2

    Verses are added to BibleBook objects as 'c', 'd', 'p' and 'v' lines,
    each completed book is given to self.stashBook(), any type 3 header
    values end up in self.suppliedMetadata['VPL'], and finally
    self.doPostLoadProcessing() is called.

    Malformed data lines are reported and skipped.  The bare `halt` names
    below are not defined in the visible code -- presumably a deliberate
    way to stop with a NameError -- and have been left untouched.
    (The Forge for SwordSearcher format, with "; TITLE:" headers and "$$"
    pointer lines, is handled by ForgeForSwordSearcherBible.load.)
    """
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print(_("Loading {}…").format(self.sourceFilepath))

    global BOS66, BOS81, BOSx
    if BOS66 is None:
        BOS66 = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')
    if BOS81 is None:
        BOS81 = BibleOrganizationalSystem('GENERIC-KJV-80-ENG')
    if BOSx is None:
        BOSx = BibleOrganizationalSystem('GENERIC-ENG')

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}

    # Type 3 header line prefixes and the settings keys they are saved under
    headerFields = (
        ('# language_name:', 'LanguageName'),
        ('# closest ISO 639-3:', 'ISOLanguageCode'),
        ('# year_short:', 'Year.short'),
        ('# year_long:', 'Year.long'),
        ('# title:', 'WorkTitle'),
        ('# URL:', 'URL'),
        ('# copyright_short:', 'Copyright.short'),
        ('# copyright_long:', 'Copyright.long'),
    )
    # Stray spaces around punctuation that get removed from type 2/3 text
    #   (applied in this order)
    spacingFixes = (
        (' ,', ','), (' .', '.'), (' ;', ';'), (' :', ':'),
        (' !', '!'), (' )', ')'), (' ]', ']'), (' ”', '”'),
        ('“ ', '“'), ('( ', '('), ('[ ', '['),
    )
    # Book numbers above 66 as used by type 2/3 files
    bnDict = {
        67: 'TOB', 68: 'JDT', 69: 'ESG', 70: 'WIS', 71: 'SIR', 72: 'BAR',
        73: 'LJE', 74: 'PAZ', 75: 'SUS', 76: 'BEL', 77: 'MA1', 78: 'MA2',
        79: 'MA3', 80: 'MA4', 81: 'ES1', 82: 'ES2', 83: 'MAN', 84: 'PS2',
        85: 'PSS', 86: 'ODE',
    }

    lineCount = 0
    vplType = bookCodeText = lastBookCodeText = BBB = lastBBB = None
    lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    thisBook = None
    settingsDict = {}
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1] == '\n':
                line = line[:-1]  # Removing trailing newline character
            if not line:
                continue  # Just discard blank lines

            if lineCount == 1:
                if self.encoding.lower() == 'utf-8' and line[0] == chr(65279):  # U+FEFF or \ufeff
                    logging.info(
                        " VPLBible.load: Detected Unicode Byte Order Marker (BOM)")
                    line = line[1:]  # Remove the Unicode Byte Order Marker (BOM)

                # Try to identify the VPL type
                match = re.search(
                    '^(\\w{2,5}?)\\s(\\d{1,3})[:\\.](\\d{1,3})\\s', line)
                if match:
                    vplType = 1
                else:
                    match = re.search('^(\\d{8})\\s', line)
                    if match:
                        vplType = 2
                    else:
                        match = re.search('^# language_name:\\s', line)
                        if match:
                            vplType = 3
                if match:
                    if BibleOrgSysGlobals.debugFlag:
                        print("First line got type #{} {!r} match from {!r}".
                              format(vplType, match.group(0), line))
                else:
                    if BibleOrgSysGlobals.verbosityLevel > 3:
                        print("VPLBible.load: (unexpected) first line was {!r} in {}"
                              .format(line, self.sourceFilepath))
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                        halt
                    continue

            # Process header stuff
            if vplType == 3:
                headerKey = None
                for prefix, key in headerFields:
                    if line.startswith(prefix):
                        headerKey, string = key, line[len(prefix):].strip()
                        break
                if headerKey is not None:
                    if string and string != 'Not available':
                        settingsDict[headerKey] = string
                    continue
                if line[0] == '#':
                    logging.warning(
                        "VPLBible.load {} is skipping unknown line: {}".
                        format(vplType, line))
                    continue  # Just discard comment lines

            # Process the main segment
            psalmTitle = None  # Set below if this line carries a «Psalm title»
            if vplType == 1:
                bits = line.split(' ', 2)
                if len(bits) != 3 or not (':' in bits[1] or '.' in bits[1]):
                    # Skip the line rather than fall through and reuse the
                    #   values left over from the previous line
                    print("Unexpected number of bits", self.givenName, BBB,
                          bookCodeText, len(bits), bits)
                    continue
                bookCodeText, CVString, vText = bits
                # The first-line check above accepts either ':' or '.' here
                separator = ':' if ':' in CVString else '.'
                chapterNumberString, separator, verseNumberString = CVString.partition(separator)
                if chapterNumberString == '':
                    chapterNumberString = '1'  # Handle a bug in some single chapter books in VPL

                if BibleOrgSysGlobals.debugFlag:
                    assert 2 <= len(bookCodeText) <= 4
                    assert chapterNumberString.isdigit()
                if not chapterNumberString.isdigit():
                    logging.error(
                        "Invalid chapter number field at {}/{} {!r}:{}".
                        format(bookCodeText, BBB, chapterNumberString,
                               verseNumberString))
                    continue
                if not verseNumberString.isdigit():
                    logging.error(
                        "Invalid verse number field at {}/{} {}:{!r}".
                        format(bookCodeText, BBB, chapterNumberString,
                               verseNumberString))
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                        assert verseNumberString.isdigit()
                    continue
                chapterNumber = int(chapterNumberString)
                verseNumber = int(verseNumberString)

                if bookCodeText != lastBookCodeText:  # We've started a new book
                    lastBBB = BBB
                    # A couple of ambiguous codes are decided by the preceding book
                    if bookCodeText == 'Le' and lastBBB == 'GEN':
                        BBB = 'LEV'
                    elif bookCodeText in ('Jud', ) and lastBBB == 'JOS':
                        BBB = 'JDG'
                    else:  # Try to guess, from the narrowest system to the widest
                        for getBBB in (BOS66.getBBBFromText,
                                       BOS81.getBBBFromText,
                                       BOSx.getBBBFromText,
                                       BibleOrgSysGlobals.BibleBooksCodes.getBBBFromText):
                            BBB = getBBB(bookCodeText)
                            if BBB:
                                break
                    if not BBB:
                        logging.critical(
                            "VPL Bible: Unable to determine book code from text {!r} after {!r}={}"
                            .format(bookCodeText, lastBookCodeText, lastBBB))
                        halt

                # Handle special formatting
                #   [square-brackets] are for Italicized words
                #   <angle-brackets> are for the Words of Christ in Red
                #   «chevrons» are for the Titles in the Book of Psalms.
                vText = vText.replace( '[', '\\add ' ).replace( ']', '\\add*' ) \
                    .replace( '<', '\\wj ' ).replace( '>', '\\wj*' )
                if vText and vText[0] == '«':
                    if BBB == 'PSA' and verseNumberString == '1':  # Psalm title
                        titleText, closer, remainder = vText[1:].partition('»')
                        if closer:
                            # The 'd' line is only written further down, after
                            #   any book switch and 'c' line, so that it lands
                            #   in the right book and chapter
                            psalmTitle = titleText
                            vText = remainder.lstrip()

                # NOTE(review): the startswith('<') test below can never be
                #   true, because every '<' was replaced just above -- the
                #   intended behaviour needs confirming before changing it
                if BBB == 'PSA' and verseNumberString == '1' and vText.startswith(
                        '<') and self.givenName == 'basic_english':
                    # Move Psalm titles to verse zero
                    verseNumber = 0

            elif vplType in (2, 3):
                bits = line.split('\t', 1)
                if len(bits) != 2:
                    logging.error(
                        "VPLBible.load{} is skipping line without a tab-separated text field: {}"
                        .format(vplType, line))
                    continue
                reference = bits[0]
                try:
                    bookNumber = int(reference[:2])
                    chapterNumber = int(reference[2:5])
                    verseNumber = int(reference[5:])
                except ValueError:
                    logging.error(
                        "VPLBible.load{} is skipping line with invalid reference field: {}"
                        .format(vplType, line))
                    continue
                # For these types the "book code" is the integer book number
                bookCodeText = bookNumber
                # Going via int removes any leading zeroes
                chapterNumberString, verseNumberString = str(chapterNumber), str(verseNumber)

                vText = bits[1]
                for wrongText, rightText in spacingFixes:
                    vText = vText.replace(wrongText, rightText)

                if bookCodeText != lastBookCodeText:  # We've started a new book
                    if 1 <= bookCodeText <= 66:
                        BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber(
                            bookCodeText)
                    else:
                        BBB = bnDict[bookCodeText]  # KeyError if the number is unknown

            else:
                logging.critical('Unknown VPL type {}'.format(vplType))
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                    halt

            if not bookCodeText:  # No bookCodeText yet
                logging.warning(
                    "VPLBible.load{} is skipping unknown pre-book line: {}"
                    .format(vplType, line))
                continue

            if bookCodeText != lastBookCodeText:  # We've started a new book
                if lastBookCodeText is not None:  # Better save the last book
                    self.stashBook(thisBook)
                if BBB:
                    if BBB in self:
                        logging.critical(
                            "Have duplicated {} book in {}".format(
                                BBB, self.givenName))
                        if BibleOrgSysGlobals.debugFlag:
                            assert BBB not in self
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = 'VPL Bible Book object'
                    thisBook.objectTypeString = 'VPL'
                    numChapters = len(BOSx.getNumVersesList(BBB))
                    lastBookCodeText = bookCodeText
                    lastChapterNumber = lastVerseNumber = -1
                else:
                    logging.critical(
                        "VPLBible{} could not figure out {!r} book code"
                        .format(vplType, bookCodeText))
                    if BibleOrgSysGlobals.debugFlag:
                        halt

            if not BBB:
                continue

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag:
                    assert chapterNumber > lastChapterNumber or BBB == 'ESG'  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info(
                        "Have chapter zero in {} {} {} {}:{}".
                        format(self.givenName, BBB, bookCodeText,
                               chapterNumberString, verseNumberString))
                elif chapterNumber > numChapters:
                    logging.error(
                        "Have high chapter number in {} {} {} {}:{} (expected max of {})"
                        .format(self.givenName, BBB, bookCodeText,
                                chapterNumberString, verseNumberString,
                                numChapters))
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            if psalmTitle is not None:
                thisBook.addLine('d', psalmTitle)  # Psalm title

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(
                    _("Ignored duplicate verse line in {} {} {} {}:{}"
                      ).format(self.givenName, BBB, bookCodeText,
                               chapterNumberString, verseNumberString))
                continue
            # The next two cases are only reported -- the verse is still added
            if verseNumber < lastVerseNumber:
                logging.warning(
                    _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}"
                      ).format(lastVerseNumber, verseNumber,
                               self.givenName, BBB, bookCodeText,
                               chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:  # but with different text
                logging.warning(
                    _("Ignored duplicated {} verse number in {} {} {} {}:{}"
                      ).format(verseNumber, self.givenName, BBB,
                               bookCodeText, chapterNumberString,
                               verseNumberString))

            # Check for paragraph markers
            if vText and vText[0] == '¶':
                thisBook.addLine('p', '')
                vText = vText[1:].lstrip()

            thisBook.addLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book
    if thisBook is not None:
        self.stashBook(thisBook)

    # Clean up
    if settingsDict:
        if self.suppliedMetadata is None:
            self.suppliedMetadata = {}
        self.suppliedMetadata['VPL'] = settingsDict
        self.applySuppliedMetadata('VPL')  # Copy some to self.settingsDict

    self.doPostLoadProcessing()
def load(self):
    """
    Load a single source file and load book elements.

    The file is read line by line:
        lines starting with ';' are header/comment lines (a few known
            "; NAME:" headers are saved as settings)
        "$$ {name}" starts a named metadata section whose following lines
            are collected as its contents
        "$$ <bookCode> <chapter>:<verse>" gives the reference for the verse
            text on the following line

    In the verse text, <scripref> elements become \\x cross-references,
    {…} becomes a footnote, […] becomes \\add and +r/…-r/ becomes \\wj.

    Each completed book is given to self.stashBook(), the collected
    settings are saved as self.suppliedMetadata['Forge4SS'], and finally
    self.doPostLoadProcessing() is called.

    The bare `halt` names below are not defined in the visible code --
    presumably a deliberate way to stop with a NameError -- and have been
    left untouched.
    """
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print(_("Loading {}…").format(self.sourceFilepath))

    global BOS66, BOS81, BOSx
    if BOS66 is None:
        BOS66 = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')
    if BOS81 is None:
        BOS81 = BibleOrganizationalSystem('GENERIC-KJV-81-ENG')
    if BOSx is None:
        BOSx = BibleOrganizationalSystem('GENERIC-ENG')

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}

    # (line prefix, index where the value starts, settings key) -- tried in order
    # NOTE(review): several of these offsets differ from the prefix length
    #   (e.g. 15 for the 16-character '; HAS FOOTNOTES:'), so the saved
    #   value can include the tail of the prefix.  They are kept exactly as
    #   they were until the intended file format has been confirmed.
    headerFields = (
        ('; TITLE:', 8, 'TITLE'),
        ('; ABBREVIATION:', 15, 'ABBREVIATION'),
        ('; HAS ITALICS', 14, 'HAS_ITALICS'),
        ('; HAS FOOTNOTES:', 15, 'HAS_FOOTNOTES'),
        ('; HAS FOOTNOTES', 14, 'HAS_FOOTNOTES'),
        ('; HAS REDLETTER', 14, 'HAS_REDLETTER'),
    )
    # Numbered book names that lose their space so that the pointer can be
    #   split on the first remaining space (applied in this order)
    numberedNames = ('1 K', '2 K', '1 Chr', '2 Chr', '1 Cor', '2 Cor',
                     '1 Thess', '2 Thess', '1 Tim', '2 Tim', '1 Pet', '2 Pet',
                     '1 J', '2 J', '3 J')
    fixedBookCodes = {'Ge': 'GEN', 'Le': 'LEV', 'La': 'LAM'}
    footnoteRegex = re.compile('\\{(.+?)\\}')
    addedRegex = re.compile('\\[(.+?)\\]')
    redLetterRegex = re.compile('\\+r/(.+?)-r/')

    lineCount = 0
    bookCode = BBB = metadataName = None
    metadataContents = ''
    lastBookCode = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    thisBook = None
    settingsDict = {}
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1] == '\n':
                line = line[:-1]  # Removing trailing newline character
            if not line:
                continue  # Just discard blank lines

            if lineCount == 1:
                if self.encoding.lower() == 'utf-8' and line[0] == chr(65279):  # U+FEFF or \ufeff
                    logging.info(
                        " ForgeForSwordSearcherBible.load: Detected Unicode Byte Order Marker (BOM)")
                    line = line[1:]  # Remove the Unicode Byte Order Marker (BOM)

                match = re.search('^; TITLE:\\s', line)
                if match:
                    if BibleOrgSysGlobals.debugFlag:
                        print("First line got type {!r} match from {!r}".format(
                            match.group(0), line))
                else:
                    if BibleOrgSysGlobals.verbosityLevel > 2:
                        print("ForgeForSwordSearcherBible.load: (unexpected) first line was {!r} in {}".format(
                            line, self.sourceFilepath))
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                        halt
                    continue

            # Process header stuff
            headerKey = None
            for prefix, valueIndex, key in headerFields:
                if line.startswith(prefix):
                    headerKey, string = key, line[valueIndex:].strip()
                    break
            if headerKey is not None:
                if string:
                    settingsDict[headerKey] = string
                continue
            if line[0] == ';':
                logging.warning(
                    "ForgeForSwordSearcherBible.load is skipping unknown header/comment line: {}".format(line))
                continue  # Just discard comment lines

            # Process the main segment
            if line.startswith('$$ '):
                if metadataName and metadataContents:
                    settingsDict[metadataName] = metadataContents
                    metadataName = None
                pointer = line[3:]
                if pointer and pointer[0] == '{' and pointer[-1] == '}':
                    metadataName = pointer[1:-1]
                    if metadataName:
                        metadataContents = ''
                else:  # let's assume it's a BCV reference
                    for numberedName in numberedNames:
                        pointer = pointer.replace(numberedName, numberedName.replace(' ', ''))
                    B_CV_Bits = pointer.split(' ', 1)
                    if len(B_CV_Bits) == 2 and ':' in B_CV_Bits[1]:
                        bookCode, CVString = B_CV_Bits
                        chapterNumberString, verseNumberString = CVString.split(':')
                        chapterNumber = int(chapterNumberString)
                        verseNumber = int(verseNumberString)
                        if bookCode != lastBookCode:  # We've started a new book
                            BBB = fixedBookCodes.get(bookCode)
                            if BBB is None:  # Try to guess
                                BBB = BOS66.getBBBFromText(bookCode)
                                if not BBB:
                                    BBB = BOS81.getBBBFromText(bookCode)
                                if not BBB:
                                    BBB = BOSx.getBBBFromText(bookCode)
                    else:
                        # NOTE(review): the text on the following line will
                        #   still be filed under the previous reference
                        print("Unexpected number of bits", self.givenName,
                              BBB, bookCode, len(B_CV_Bits), B_CV_Bits)
                continue  # Just save the pointer information which refers to the text on the next line

            # It's not a $$ line
            if metadataName:
                metadataContents += ('\n' if metadataContents else '') + line
                continue

            if not bookCode:  # No bookCode (and so no chapter/verse) yet
                logging.warning(
                    "ForgeForSwordSearcherBible.load is skipping unknown pre-book line: {}".format(line))
                continue

            # Handle bits like (<scripref>Pr 2:7</scripref>)
            vText = line.replace('(<scripref>', '\\x - \\xt ').replace('</scripref>)', '\\x*')
            vText = vText.replace('<scripref>', '\\x - \\xt ').replace('</scripref>', '\\x*')

            # Function replacements are used so that any backslashes in the
            #   matched text are copied through unchanged
            # Convert {stuff} to footnotes
            vText = footnoteRegex.sub(
                lambda found: '\\f + \\fr {}:{} \\ft {}\\f*'.format(
                    chapterNumber, verseNumber, found.group(1)), vText)
            # Convert [stuff] to added fields
            vText = addedRegex.sub(
                lambda found: '\\add {}\\add*'.format(found.group(1)), vText)
            # Convert +r/This text is red-letter-r/ to wj fields
            vText = redLetterRegex.sub(
                lambda found: '\\wj {}\\wj*'.format(found.group(1)), vText)

            # Final check for unexpected remaining formatting
            if any(badChar in vText for badChar in '{}[]/'):
                logging.warning(
                    "Found remaining braces,brackets or slashes in SwordSearcher Forge VPL {} {}:{} {!r}".format(
                        BBB, chapterNumberString, verseNumberString, vText))

            if bookCode != lastBookCode:  # We've started a new book
                if lastBookCode != -1:  # Better save the last book
                    self.stashBook(thisBook)
                if BBB:
                    if BBB in self:
                        logging.critical(
                            "Have duplicated {} book in {}".format(BBB, self.givenName))
                        if BibleOrgSysGlobals.debugFlag:
                            assert BBB not in self
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = 'ForgeForSwordSearcher Bible Book object'
                    thisBook.objectTypeString = 'ForgeForSwordSearcher'
                    numChapters = len(BOSx.getNumVersesList(BBB))
                    lastBookCode = bookCode
                    lastChapterNumber = lastVerseNumber = -1
                else:
                    logging.critical(
                        "ForgeForSwordSearcherBible could not figure out {!r} book code".format(bookCode))
                    if BibleOrgSysGlobals.debugFlag:
                        halt

            if not BBB:
                continue

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag:
                    assert chapterNumber > lastChapterNumber or BBB == 'ESG'  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info("Have chapter zero in {} {} {} {}:{}".format(
                        self.givenName, BBB, bookCode, chapterNumberString, verseNumberString))
                elif chapterNumber > numChapters:
                    logging.error(
                        "Have high chapter number in {} {} {} {}:{} (expected max of {})".format(
                            self.givenName, BBB, bookCode, chapterNumberString,
                            verseNumberString, numChapters))
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(
                    _("Ignored duplicate verse line in {} {} {} {}:{}").format(
                        self.givenName, BBB, bookCode, chapterNumberString, verseNumberString))
                continue
            # The next two cases are only reported -- the verse is still added
            if verseNumber < lastVerseNumber:
                logging.warning(
                    _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format(
                        lastVerseNumber, verseNumber, self.givenName, BBB, bookCode,
                        chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:  # but with different text
                logging.warning(
                    _("Ignored duplicated {} verse number in {} {} {} {}:{}").format(
                        verseNumber, self.givenName, BBB, bookCode,
                        chapterNumberString, verseNumberString))

            # Check for paragraph markers
            if vText and vText[0] == '¶':
                thisBook.addLine('p', '')
                vText = vText[1:].lstrip()

            thisBook.addLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # A metadata section that runs to the end of the file has no following
    #   $$ line to trigger saving it, so save it here
    if metadataName and metadataContents:
        settingsDict[metadataName] = metadataContents

    # Save the final book
    if thisBook is not None:
        self.stashBook(thisBook)

    # Clean up
    if settingsDict:
        if self.suppliedMetadata is None:
            self.suppliedMetadata = {}
        self.suppliedMetadata['Forge4SS'] = settingsDict
        self.applySuppliedMetadata('Forge4SS')  # Copy some to self.settingsDict

    self.doPostLoadProcessing()
class HaggaiXMLBible( Bible ):
    """
    Class for reading, validating, and converting HaggaiXMLBible XML.

    The expected document layout (as handled by the methods below) is a
        XMLBIBLE root containing an optional INFORMATION header (first or last child)
        and BIBLEBOOK elements, which contain CAPTION and CHAPTER elements,
        which contain PARAGRAPH elements, which contain VERSE elements,
        which may contain NOTE, STYLE and BR subelements.
    """
    XMLNameSpace = "{http://www.w3.org/2001/XMLSchema-instance}"
    treeTag = 'XMLBIBLE'
    infoTag = 'INFORMATION'
    bookTag = 'BIBLEBOOK'
    chapterTag = 'CHAPTER'
    captionTag = 'CAPTION'
    paragraphTag = 'PARAGRAPH'
    verseTag = 'VERSE'
    noteTag = 'NOTE'
    styleTag = 'STYLE'
    breakTag = 'BR'


    def __init__( self, sourceFolder, givenName, encoding='utf-8' ):
        """
        Constructor: just sets up the Haggai Bible object.

        sourceFolder and givenName are joined to give the path of the XML file.
        Nothing is loaded here -- call load() for that.
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = 'Haggai XML Bible object'
        self.objectTypeString = 'Haggai'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join( self.sourceFolder, self.givenName )

        self.XMLTree = self.header = None # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' )

        # Do a preliminary check on the readability of our file
        #   (only reports the problem -- load() will still be attempted if called)
        if not os.access( self.sourceFilepath, os.R_OK ):
            print( "HaggaiXMLBible: File {!r} is unreadable".format( self.sourceFilepath ) )

        self.name = self.givenName
    # end of HaggaiXMLBible.__init__


    def load( self ):
        """
        Load a single source XML file and load book elements.

        If the file cannot be parsed, the error is logged as critical
            and the method returns without loading anything.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) )
        try: self.XMLTree = ElementTree().parse( self.sourceFilepath )
        except ParseError as err:
            logging.critical( exp("Loader parse error in xml file {}: {} {}").format( self.givenName, sys.exc_info()[0], err ) )
            return # There's no tree to process (continuing would only fail on the missing tree)
        if BibleOrgSysGlobals.debugFlag: assert len( self.XMLTree ) # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.XMLTree.tag == HaggaiXMLBible.treeTag:
            location = "Haggai XML file"
            BibleOrgSysGlobals.checkXMLNoText( self.XMLTree, location, '4f6h' )
            BibleOrgSysGlobals.checkXMLNoTail( self.XMLTree, location, '1wk8' )

            name = status = revision = version = None
            for attrib,value in self.XMLTree.items():
                if attrib == "biblename": name = value
                elif attrib == "status": status = value
                elif attrib == "revision": revision = value
                elif attrib == 'version': version = value
                elif attrib in ( HaggaiXMLBible.XMLNameSpace + 'noNamespaceSchemaLocation', "lgid", "type" ):
                    pass # Recognised but not stored (in italian.xml lgid is set to "german")
                else: logging.warning( "Unprocessed {!r} attribute ({}) in main element".format( attrib, value ) )
            if name: self.name = name
            if status: self.status = status
            if revision: self.revision = revision
            if version: self.version = version

            # The information record can be either at the beginning or at the END of the file
            #   (An empty root element has neither, so check the length before indexing)
            headerElement = None
            if len( self.XMLTree ):
                if self.XMLTree[0].tag == HaggaiXMLBible.infoTag: headerElement = self.XMLTree[0]
                elif self.XMLTree[-1].tag == HaggaiXMLBible.infoTag: headerElement = self.XMLTree[-1]
            if headerElement is not None: # Don't use a truth test here -- it's False for elements with no children
                self.header = headerElement
                self.XMLTree.remove( self.header ) # So that only books remain in the tree
                self.__validateAndExtractHeader()

            # Find the submain (book) containers
            for element in self.XMLTree:
                if element.tag == HaggaiXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'g3g5' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'd3f6' )
                    self.__validateAndExtractBook( element )
                else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.bookTag, element.tag ) )
        else: logging.error( "Expected to load {!r} but got {!r}".format( HaggaiXMLBible.treeTag, self.XMLTree.tag ) )
        self.doPostLoadProcessing()
    # end of HaggaiXMLBible.load


    def __validateAndExtractHeader( self ):
        """
        Extracts information out of the header record, such as:
            <INFORMATION>
            <title>King James Version</title>
            <creator></creator>
            <subject>The Holy Bible</subject>
            <description>In 1604, King James I of England authorized that a new translation of the Bible into English be started. It was finished in 1611, just 85 years after the first translation of the New Testament into English appeared (Tyndale, 1526). The Authorized Version, or King James Version, quickly became the standard for English-speaking Protestants. Its flowing language and prose rhythm has had a profound influence on the literature of the past 300 years.</description>
            <publisher>FREE BIBLE SOFTWARE GROUP</publisher>
            <contributors />
            <date>2009-01-23</date>
            <type>Bible</type>
            <format>Haggai XML Bible Markup Language</format>
            <identifier>kjv</identifier>
            <source>http://www.unboundbible.com/zips/index.cfm?lang=English</source>
            <language>ENG</language>
            <coverage>provide the Bible to the nations of the world</coverage>
            <rights>We believe that this Bible is found in the Public Domain.</rights>
            </INFORMATION>

        Each recognised field is saved as an attribute of this object.
        Multiple contributor fields are collected into a (flat) list.
        """
        if BibleOrgSysGlobals.debugFlag: assert self.header is not None # (An element with no children tests as False)
        location = 'Header'
        BibleOrgSysGlobals.checkXMLNoAttributes( self.header, location, 'j4j6' )
        BibleOrgSysGlobals.checkXMLNoText( self.header, location, 'sk4l' )
        BibleOrgSysGlobals.checkXMLNoTail( self.header, location, 'a2d4' )

        # TODO: We probably need to rationalise some of the self.xxx stores
        # These map the XML tag to the name of the attribute that we store the text in
        mandatoryFields = { 'title':'title', 'description':'description', 'date':'date',
                            'identifier':'identifier', 'source':'source', 'language':'language' } # Always stored
        optionalFields = { 'creator':'creator', 'subject':'subject', 'publisher':'publisher',
                            'contributors':'contributors', 'type':'documentType',
                            'coverage':'coverage', 'rights':'rights' } # Only stored if not empty
        specialFields = ( 'contributor', 'format' )

        for element in self.header:
            tag = element.tag
            if tag not in mandatoryFields and tag not in optionalFields and tag not in specialFields:
                logging.error( "Found unexpected {!r} tag in {}".format( tag, location ) )
                continue

            sublocation = "{} in {}".format( tag, location )
            tailCode, attributesCode, subelementsCode = ('alj1d','j3jjd','5gk78') if tag=='contributor' else ('al1d','j3jd','5g78')
            BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, tailCode )
            BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, attributesCode )
            BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, subelementsCode )

            if tag in mandatoryFields:
                if BibleOrgSysGlobals.debugFlag: assert element.text
                setattr( self, mandatoryFields[tag], element.text )
            elif tag in optionalFields:
                if element.text: setattr( self, optionalFields[tag], element.text )
            elif tag == 'contributor':
                if element.text:
                    try: previous = self.contributor
                    except AttributeError: self.contributor = element.text # Must be the first (and possibly only) one
                    else: # Put multiples into a list (extending it rather than nesting it for the third and later ones)
                        if isinstance( previous, list ): self.contributor = previous + [ element.text ]
                        else: self.contributor = [ previous, element.text ]
            else: # tag == 'format' -- only checked, not stored
                if BibleOrgSysGlobals.debugFlag: assert element.text
                if BibleOrgSysGlobals.debugFlag: assert element.text == 'Haggai XML Bible Markup Language'
    # end of HaggaiXMLBible.__validateAndExtractHeader


    def __validateAndExtractBook( self, book ):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book is identified from its bnumber attribute if there is one,
            otherwise from its bname attribute.
        A book which can't be identified is not loaded.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML book…") )

        # Process the div attributes first
        BBB = bookName = bookShortName = bookNumber = None
        for attrib,value in book.items():
            if attrib=="bnumber": bookNumber = value
            elif attrib=="bname": bookName = value
            elif attrib=="bsname": bookShortName = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in book element".format( attrib, value ) )
        if bookNumber:
            try: BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
            except KeyError:
                logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it" \
                                                                        .format( bookNumber, bookName, bookShortName ) )
        elif bookName:
            BBB = self.genericBOS.getBBBFromText( bookName )

        if BBB:
            if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Validating {} {}…").format( BBB, bookName ) )
            thisBook = BibleBook( self, BBB )
            thisBook.objectNameString = 'Haggai XML Bible Book object'
            thisBook.objectTypeString = 'Haggai'
            for element in book:
                if element.tag == HaggaiXMLBible.captionTag:
                    sublocation = "caption in {}".format( BBB )
                    BibleOrgSysGlobals.checkXMLNoAttributes( element, sublocation, 'jhl6' )
                    BibleOrgSysGlobals.checkXMLNoSubelements( element, sublocation, 'jk21' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'kjh6' )
                    thisBook.addLine( 'mt', element.text ) # A book caption is saved as the main title
                elif element.tag == HaggaiXMLBible.chapterTag:
                    sublocation = "chapter in {}".format( BBB )
                    BibleOrgSysGlobals.checkXMLNoText( element, sublocation, 'j3jd' )
                    BibleOrgSysGlobals.checkXMLNoTail( element, sublocation, 'al1d' )
                    self.__validateAndExtractChapter( BBB, thisBook, element )
                else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.chapterTag, element.tag ) )
            if BibleOrgSysGlobals.verbosityLevel > 2: print( " Saving {} into results…".format( BBB ) )
            self.stashBook( thisBook )
    # end of HaggaiXMLBible.__validateAndExtractBook


    def __validateAndExtractChapter( self, BBB, thisBook, chapter ):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        The chapter number is read from the cnumber attribute.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML chapter…") )

        # Process the chapter attributes first
        chapterNumber = None
        for attrib,value in chapter.items():
            if attrib=="cnumber": chapterNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in chapter element".format( attrib, value ) )
        if chapterNumber: thisBook.addLine( 'c', chapterNumber )
        else: logging.error( "Missing 'cnumber' attribute in chapter element for {}".format( BBB ) )

        # NOTE(review): The 'disabled' suffixes below mean that verses and captions
        #   directly inside a chapter are never matched (and so get reported as errors)
        #   -- presumably they are only expected inside paragraphs -- TODO confirm
        for element in chapter:
            if element.tag == HaggaiXMLBible.paragraphTag:
                self.__validateAndExtractParagraph( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.verseTag+'disabled':
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
                self.__extractCaption( BBB, chapterNumber, thisBook, element )
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, element.tag ) )
    # end of HaggaiXMLBible.__validateAndExtractChapter


    def __validateAndExtractParagraph( self, BBB, chapterNumber, thisBook, paragraph ):
        """
        Check/validate and extract paragraph data from the given XML book record
            finding and saving paragraphs and
            finding and saving verse elements.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML paragraph…") )

        location = "paragraph in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoAttributes( paragraph, location, 'brgw3' )
        BibleOrgSysGlobals.checkXMLNoText( paragraph, location, 'brgw3' )
        BibleOrgSysGlobals.checkXMLNoTail( paragraph, location, 'brgw3' )
        thisBook.addLine( 'p', '' )

        # Handle verse subelements (verses)
        for element in paragraph:
            if element.tag == HaggaiXMLBible.verseTag:
                self.__validateAndExtractVerse( BBB, chapterNumber, thisBook, element )
            elif element.tag == HaggaiXMLBible.captionTag+'disabled': # Used in Psalms
                self.__extractCaption( BBB, chapterNumber, thisBook, element )
            else: logging.error( "Expected to find {!r} but got {!r}".format( HaggaiXMLBible.verseTag, element.tag ) )
    # end of HaggaiXMLBible.__validateAndExtractParagraph


    def __extractCaption( self, BBB, chapterNumber, thisBook, caption ):
        """
        Check/validate the given XML caption element (used in Psalms)
            and save any text that it contains as verse zero.
        """
        location = "caption in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( caption, location, 'k5k8' )
        BibleOrgSysGlobals.checkXMLNoSubelements( caption, location, 'd3f5' )

        # Handle caption attributes
        vRef = None
        for attrib,value in caption.items():
            if attrib=="vref":
                vRef = value
                if BibleOrgSysGlobals.debugFlag: assert vRef == '1'
            else: logging.warning( "Unprocessed {!r} attribute ({}) in caption element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert vRef

        vText = caption.text
        if vText: thisBook.addLine( 'v', '0' + ' ' + vText ) # This is the main text of the caption -- we save it as verse zero
        else: logging.warning( "{} {}:{} has no text".format( BBB, chapterNumber, vRef ) )
    # end of HaggaiXMLBible.__extractCaption


    def __validateAndExtractVerse( self, BBB, chapterNumber, thisBook, verse ):
        """
        Check/validate and extract verse data from the given XML book record
            finding and saving verse elements.

        Notes are converted to USFM footnotes and styled portions to USFM character markers
            inside the verse text. A line break saves the verse text collected so far
            and then starts a 'm' line containing the text that follows the break.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3: print( _("Validating XML verse…") )

        location = "verse in {} {}".format( BBB, chapterNumber )
        BibleOrgSysGlobals.checkXMLNoTail( verse, location, 'l5ks' )

        # Handle verse attributes
        verseNumber = None
        for attrib,value in verse.items():
            if attrib=="vnumber": verseNumber = value
            else: logging.warning( "Unprocessed {!r} attribute ({}) in verse element".format( attrib, value ) )
        if BibleOrgSysGlobals.debugFlag: assert verseNumber
        location = "{}:{}".format( location, verseNumber ) # Get a better location description
        vText = '' if verse.text is None else verse.text
        if vText: vText = vText.strip()
        # NB: vText can be empty here if a verse starts immediately with a style or note

        # Handle verse subelements (notes and styled portions)
        for subelement in verse:
            if subelement.tag == HaggaiXMLBible.noteTag:
                sublocation = "note in " + location
                noteType = None
                for attrib,value in subelement.items():
                    if attrib=="type": noteType = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in note subelement".format( attrib, value ) )
                if noteType and noteType not in ('variant',):
                    logging.warning( "Unexpected {} note type in {}".format( noteType, BBB ) )
                nText, nTail = subelement.text, subelement.tail
                vText += "\\f + \\fk {} \\ft {}\\f*".format( noteType, nText ) if noteType else "\\f + \\ft {}\\f*".format( nText )
                if nTail:
                    if '\n' in nTail:
                        print( "HaggaiXMLBible.__validateAndExtractVerse: nTail {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, nTail ) )
                        nTail = nTail.replace( '\n', ' ' )
                    vText += nTail
                # NOTE(review): Styled portions of the note get appended AFTER the closed footnote
                #   (and after its tail) -- looks unintended -- TODO confirm
                for sub2element in subelement:
                    if sub2element.tag == HaggaiXMLBible.styleTag:
                        sub2location = "style in " + sublocation
                        BibleOrgSysGlobals.checkXMLNoSubelements( sub2element, sub2location, 'fyt4' )
                        fs = css = idStyle = None
                        for attrib,value in sub2element.items():
                            if attrib=='fs': fs = value
                            else: logging.warning( "Unprocessed {!r} attribute ({}) in style sub2element".format( attrib, value ) )
                        if BibleOrgSysGlobals.debugFlag: assert fs or css or idStyle
                        SFM = None
                        if fs == 'italic': SFM = '\\it'
                        elif fs == 'super': SFM = '\\bdit'
                        elif fs == 'emphasis': SFM = '\\em'
                        else: print( "fs is", fs, "css is", css, "idStyle is", idStyle ); halt
                        sText, sTail = sub2element.text.strip(), sub2element.tail
                        if BibleOrgSysGlobals.debugFlag: assert sText
                        if SFM: vText += SFM+' ' + sText + SFM+'*'
                        else: vText += '\\sc ' + '['+css+']' + sText + '\\sc* ' # Use sc for unknown styles
                        if sTail: vText += sTail.strip()
                    else: logging.error( "Expected to find {} but got {!r} in {}".format( HaggaiXMLBible.styleTag, sub2element.tag, sublocation ) )

            elif subelement.tag == HaggaiXMLBible.styleTag:
                sublocation = "style in " + location
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'f5gh' )
                fs = css = idStyle = None
                for attrib,value in subelement.items():
                    if attrib=="fs": fs = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in style subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert fs
                SFM = None
                if fs == 'super': SFM = '\\bdit'
                elif fs == 'emphasis': SFM = '\\em'
                else: print( "fs is", fs, "css is", css, "idStyle is", idStyle ); halt
                sText, sTail = subelement.text.strip(), subelement.tail
                if BibleOrgSysGlobals.debugFlag: assert sText
                if SFM: vText += SFM+' ' + sText + SFM+'*'
                else: vText += '\\sc ' + '['+css+']' + sText + '\\sc* ' # Use sc for unknown styles
                if sTail: vText += sTail.strip()

            elif subelement.tag == HaggaiXMLBible.breakTag:
                sublocation = "line break in " + location
                BibleOrgSysGlobals.checkXMLNoText( subelement, sublocation, 'c1d4' )
                BibleOrgSysGlobals.checkXMLNoSubelements( subelement, sublocation, 'g4g8' )
                art = None
                for attrib,value in subelement.items():
                    if attrib=="art": art = value
                    else: logging.warning( "Unprocessed {!r} attribute ({}) in line break subelement".format( attrib, value ) )
                if BibleOrgSysGlobals.debugFlag: assert art == 'x-nl'
                # NOTE(review): verseNumber is set to None once the verse line has been saved,
                #   so any text collected after a line break (e.g., from a following style subelement)
                #   will make the next concatenation with verseNumber fail -- TODO confirm the intended marker
                if vText:
                    thisBook.addLine( 'v', verseNumber + ' ' + vText ); verseNumber = None
                    vText = ''
                thisBook.addLine( 'm', subelement.tail.strip() if subelement.tail else '' )

            else: logging.error( "Expected to find NOTE or STYLE but got {!r} in {}".format( subelement.tag, location ) )

        if vText: # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:
                print( "HaggaiXMLBible.__validateAndExtractVerse: vText {} {}:{} {!r}".format( BBB, chapterNumber, verseNumber, vText ) )
                vText = vText.replace( '\n', ' ' )
            thisBook.addLine( 'v', verseNumber + ' ' + vText ); verseNumber = None
    # end of HaggaiXMLBible.__validateAndExtractVerse
# end of HaggaiXMLBible class
def load( self ): """ Load a single source file and load book elements. """ if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading {}…").format( self.sourceFilepath ) ) global BOS66, BOS81, BOSx if BOS66 is None: BOS66 = BibleOrganizationalSystem( 'GENERIC-KJV-66-ENG' ) if BOS81 is None: BOS81 = BibleOrganizationalSystem( 'GENERIC-KJV-81-ENG' ) if BOSx is None: BOSx = BibleOrganizationalSystem( 'GENERIC-ENG' ) if self.suppliedMetadata is None: self.suppliedMetadata = {} lastLine, lineCount = '', 0 vplType = bookCode = BBB = metadataName = None lastBookCode = lastChapterNumber = lastVerseNumber = -1 lastVText = '' thisBook = None settingsDict = {} with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done for line in myFile: lineCount += 1 if line[-1]=='\n': line=line[:-1] # Removing trailing newline character if not line: continue # Just discard blank lines if lineCount==1: if self.encoding.lower()=='utf-8' and line[0]==chr(65279): #U+FEFF or \ufeff logging.info( " VPLBible.load: Detected Unicode Byte Order Marker (BOM)" ) line = line[1:] # Remove the Unicode Byte Order Marker (BOM) # Try to identify the VPL type match = re.search( '^(\\w{2,5}?)\\s(\\d{1,3})[:\\.](\\d{1,3})\\s', line ) if match: vplType = 1 else: match = re.search( '^(\\d{8})\\s', line ) if match: vplType = 2 else: match = re.search( '^# language_name:\\s', line ) if match: vplType = 3 #else: #match = re.search( '^; TITLE:\\s', line ) #if match: vplType = 4 if match: if BibleOrgSysGlobals.debugFlag: print( "First line got type #{} {!r} match from {!r}".format( vplType, match.group(0), line ) ) else: if BibleOrgSysGlobals.verbosityLevel > 2: print( "VPLBible.load: (unexpected) first line was {!r} in {}".format( line, self.sourceFilepath ) ) if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt continue #print( 'vplType', vplType ) #print ( 'VPL file line is "' + line + '"' ) lastLine = line # Process header stuff if vplType == 3: if 
line.startswith( '# language_name:' ): string = line[16:].strip() if string and string != 'Not available': settingsDict['LanguageName'] = string continue elif line.startswith( '# closest ISO 639-3:' ): string = line[20:].strip() if string and string != 'Not available': settingsDict['ISOLanguageCode'] = string continue elif line.startswith( '# year_short:' ): string = line[13:].strip() if string and string != 'Not available': settingsDict['Year.short'] = string continue elif line.startswith( '# year_long:' ): string = line[12:].strip() if string and string != 'Not available': settingsDict['Year.long'] = string continue elif line.startswith( '# title:' ): string = line[8:].strip() if string and string != 'Not available': settingsDict['WorkTitle'] = string continue elif line.startswith( '# URL:' ): string = line[6:].strip() if string and string != 'Not available': settingsDict['URL'] = string continue elif line.startswith( '# copyright_short:' ): string = line[18:].strip() if string and string != 'Not available': settingsDict['Copyright.short'] = string continue elif line.startswith( '# copyright_long:' ): string = line[17:].strip() if string and string != 'Not available': settingsDict['Copyright.long'] = string continue elif line[0]=='#': logging.warning( "VPLBible.load {} is skipping unknown line: {}".format( vplType, line ) ) continue # Just discard comment lines #elif vplType == 4: #if line.startswith( '; TITLE:' ): #string = line[8:].strip() #if string: settingsDict['TITLE'] = string #continue #elif line.startswith( '; ABBREVIATION:' ): #string = line[15:].strip() #if string: settingsDict['ABBREVIATION'] = string #continue #elif line.startswith( '; HAS ITALICS:' ): #string = line[15:].strip() #if string: settingsDict['HAS_ITALICS'] = string #continue #elif line.startswith( '; HAS FOOTNOTES:' ): #string = line[15:].strip() #if string: settingsDict['HAS_FOOTNOTES'] = string #continue #elif line.startswith( '; HAS FOOTNOTES' ): #string = line[14:].strip() #if 
string: settingsDict['HAS_FOOTNOTES'] = string #continue #elif line.startswith( '; HAS REDLETTER:' ): #string = line[15:].strip() #if string: settingsDict['HAS_REDLETTER'] = string #continue #elif line[0]==';': #logging.warning( "VPLBible.load{} is skipping unknown header/comment line: {}".format( vplType, line ) ) #continue # Just discard comment lines # Process the main segment if vplType == 1: bits = line.split( ' ', 2 ) #print( self.givenName, BBB, bits ) if len(bits) == 3 and ':' in bits[1]: bookCode, CVString, vText = bits chapterNumberString, verseNumberString = CVString.split( ':' ) else: print( "Unexpected number of bits", self.givenName, BBB, bookCode, chapterNumberString, verseNumberString, len(bits), bits ) if not bookCode and not chapterNumberString and not verseNumberString: print( "Skipping empty line in {} {} {} {}:{}".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) continue if BibleOrgSysGlobals.debugFlag: assert 2 <= len(bookCode) <= 4 if BibleOrgSysGlobals.debugFlag: assert chapterNumberString.isdigit() if not verseNumberString.isdigit(): logging.error( "Invalid verse number field at {}/{} {}:{!r}".format( bookCode, BBB, chapterNumberString, verseNumberString ) ) if BibleOrgSysGlobals.debugFlag and debuggingThisModule: assert verseNumberString.isdigit() continue chapterNumber = int( chapterNumberString ) verseNumber = int( verseNumberString ) if bookCode != lastBookCode: # We've started a new book #if bookCode in ('Ge',): BBB = 'GEN' if bookCode in ('Le',): BBB = 'LEV' elif bookCode in ('Jud',): BBB = 'JDG' #elif bookCode in ('Es',): BBB = 'EST' #elif bookCode in ('Pr',): BBB = 'PRO' elif bookCode in ('So',): BBB = 'SNG' elif bookCode in ('La',): BBB = 'LAM' #elif bookCode in ('Jude',): BBB = 'JDE' else: #BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromText( bookCode ) # Try to guess BBB = BOS66.getBBBFromText( bookCode ) # Try to guess if not BBB: BBB = BOS81.getBBBFromText( bookCode ) # Try to guess if not 
BBB: BBB = BOSx.getBBBFromText( bookCode ) # Try to guess # Handle special formatting # [square-brackets] are for Italicized words # <angle-brackets> are for the Words of Christ in Red # «chevrons» are for the Titles in the Book of Psalms. vText = vText.replace( '[', '\\add ' ).replace( ']', '\\add*' ) \ .replace( '<', '\\wj ' ).replace( '>', '\\wj*' ) if vText and vText[0]=='«': #print( "Oh!", BBB, chapterNumberString, verseNumberString, repr(vText) ) if BBB=='PSA' and verseNumberString=='1': # Psalm title vBits = vText[1:].split( '»' ) #print( "vBits", vBits ) thisBook.addLine( 'd', vBits[0] ) # Psalm title vText = vBits[1].lstrip() # Handle the verse info #if verseNumber==lastVerseNumber and vText==lastVText: #logging.warning( _("Ignored duplicate verse line in {} {} {} {}:{}").format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) #continue if BBB=='PSA' and verseNumberString=='1' and vText.startswith('<') and self.givenName=='basic_english': # Move Psalm titles to verse zero verseNumber = 0 #if verseNumber < lastVerseNumber: #logging.warning( _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format( lastVerseNumber, verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) #elif verseNumber == lastVerseNumber: #if vText == lastVText: #logging.warning( _("Ignored duplicated {} verse in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) #else: #logging.warning( _("Ignored duplicated {} verse number in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) elif vplType in (2,3): bits = line.split( '\t', 1 ) #print( self.givenName, BBB, bits ) bookNumberString, chapterNumberString, verseNumberString = bits[0][:2], bits[0][2:5], bits[0][5:] #print( bookNumberString, chapterNumberString, verseNumberString ) while len(chapterNumberString)>1 and chapterNumberString[0]=='0': 
chapterNumberString = chapterNumberString[1:] # Remove leading zeroes while len(verseNumberString)>1 and verseNumberString[0]=='0': verseNumberString = verseNumberString[1:] # Remove leading zeroes bookCode, chapterNumber, verseNumber = int( bookNumberString), int(chapterNumberString), int(verseNumberString) vText = bits[1].replace(' ,',',').replace(' .','.').replace(' ;',';').replace(' :',':') \ .replace(' !','!').replace(' )',')').replace(' ]',']').replace(' ”','”') \ .replace('“ ','“').replace('( ','(').replace('[ ','[') #.replace(' !','!') if bookCode != lastBookCode: # We've started a new book bnDict = { 67:'TOB', 68:'JDT', 69:'ESG', 70:'WIS', 71:'SIR', 72:'BAR', 73:'LJE', 74:'PAZ', 75:'SUS', 76:'BEL', 77:'MA1', 78:'MA2', 79:'MA3', 80:'MA4', 81:'ES1', 82:'ES2', 83:'MAN', 84:'PS2', 85:'PSS', 86:'ODE', } if 1 <= bookCode <= 66: BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromReferenceNumber( bookCode ) else: BBB = bnDict[bookCode] #elif vplType == 4: #if line.startswith( '$$ ' ): #if metadataName and metadataContents: #settingsDict[metadataName] = metadataContents #metadataName = None #pointer = line[3:] ##print( "pointer", repr(pointer) ) #if pointer and pointer[0]=='{' and pointer[-1]=='}': #metadataName = pointer[1:-1] #if metadataName: ##print( "metadataName", repr(metadataName) ) #metadataContents = '' #else: # let's assume it's a BCV reference #pointer = pointer.replace( '1 K','1K' ).replace( '2 K','2K' ) \ #.replace( '1 Chr','1Chr' ).replace( '2 Chr','2Chr' ) \ #.replace( '1 Cor','1Cor' ).replace( '2 Cor','2Cor' ) \ #.replace( '1 Thess','1Thess' ).replace( '2 Thess','2Thess' ) \ #.replace( '1 Tim','1Tim' ).replace( '2 Tim','2Tim' ) \ #.replace( '1 Pet','1Pet' ).replace( '2 Pet','2Pet' ) \ #.replace( '1 J','1J' ).replace( '2 J','2J' ).replace( '3 J','3J' ) #B_CV_Bits = pointer.split( ' ', 1 ) #if len(B_CV_Bits) == 2 and ':' in B_CV_Bits[1]: #bookCode, CVString = B_CV_Bits #chapterNumberString, verseNumberString = CVString.split( ':' ) #chapterNumber = 
int( chapterNumberString ) #verseNumber = int( verseNumberString ) #if bookCode != lastBookCode: # We've started a new book #if bookCode in ('Ge',): BBB = 'GEN' #elif bookCode in ('Le',): BBB = 'LEV' #elif bookCode in ('La',): BBB = 'LAM' #else: ##print( "4BookCode =", repr(bookCode) ) ##BBB = BOS.getBBBFromText( bookCode ) # Try to guess #BBB = BOS66.getBBBFromText( bookCode ) # Try to guess #if not BBB: BBB = BOS81.getBBBFromText( bookCode ) # Try to guess #if not BBB: BBB = BOSx.getBBBFromText( bookCode ) # Try to guess ##print( "4BBB =", repr(BBB) ) #else: print( "Unexpected number of bits", self.givenName, BBB, bookCode, chapterNumberString, verseNumberString, len(bits), bits ) #continue # Just save the pointer information which refers to the text on the next line #else: # it's not a $$ line #text = line ##print( "text", repr(text) ) #if metadataName: #metadataContents += ('\n' if metadataContents else '') + text #continue #else: #vText = text ## Handle bits like (<scripref>Pr 2:7</scripref>) #vText = vText.replace( '(<scripref>', '\\x - \\xt ' ).replace( '</scripref>)', '\\x*' ) #vText = vText.replace( '<scripref>', '\\x - \\xt ' ).replace( '</scripref>', '\\x*' ) ##if '\\' in vText: print( 'VPL vText', repr(vText) ) #if vplType == 4: # Forge for SwordSearcher ##print( BBB, chapterNumber, verseNumber, repr(vText) ) ## Convert {stuff} to footnotes #match = re.search( '\\{(.+?)\\}', vText ) #while match: #footnoteText = '\\f + \\fr {}:{} \\ft {}\\f*'.format( chapterNumber, verseNumber, match.group(1) ) #vText = vText[:match.start()] + footnoteText + vText[match.end():] # Replace this footnote ##print( BBB, chapterNumber, verseNumber, repr(vText) ) #match = re.search( '\\{(.+?)\\}', vText ) ## Convert [stuff] to added fields #match = re.search( '\\[(.+?)\\]', vText ) #while match: #addText = '\\add {}\\add*'.format( match.group(1) ) #vText = vText[:match.start()] + addText + vText[match.end():] # Replace this chunk ##print( BBB, chapterNumber, verseNumber, 
repr(vText) ) #match = re.search( '\\[(.+?)\\]', vText ) #for badChar in '{}[]': #if badChar in vText: #logging.warning( "Found remaining braces or brackets in SwordSearcher Forge VPL {} {}:{} {!r}".format( BBB, chapterNumberString, verseNumberString, vText ) ) #break else: logging.critical( 'Unknown VPL type {}'.format( vplType ) ) if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt if bookCode: if bookCode != lastBookCode: # We've started a new book if lastBookCode != -1: # Better save the last book self.stashBook( thisBook ) if BBB: if BBB in self: logging.critical( "Have duplicated {} book in {}".format( self.givenName, BBB ) ) if BibleOrgSysGlobals.debugFlag: assert BBB not in self thisBook = BibleBook( self, BBB ) thisBook.objectNameString = 'VPL Bible Book object' thisBook.objectTypeString = 'VPL' verseList = BOSx.getNumVersesList( BBB ) numChapters, numVerses = len(verseList), verseList[0] lastBookCode = bookCode lastChapterNumber = lastVerseNumber = -1 else: logging.critical( "VPLBible{} could not figure out {!r} book code".format( vplType, bookCode ) ) if BibleOrgSysGlobals.debugFlag: halt if BBB: if chapterNumber != lastChapterNumber: # We've started a new chapter if BibleOrgSysGlobals.debugFlag: assert chapterNumber > lastChapterNumber or BBB=='ESG' # Esther Greek might be an exception if chapterNumber == 0: logging.info( "Have chapter zero in {} {} {} {}:{}".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) elif chapterNumber > numChapters: logging.error( "Have high chapter number in {} {} {} {}:{} (expected max of {})".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString, numChapters ) ) thisBook.addLine( 'c', chapterNumberString ) lastChapterNumber = chapterNumber lastVerseNumber = -1 # Handle the verse info if verseNumber==lastVerseNumber and vText==lastVText: logging.warning( _("Ignored duplicate verse line in {} {} {} {}:{}").format( self.givenName, BBB, bookCode, 
chapterNumberString, verseNumberString ) ) continue if verseNumber < lastVerseNumber: logging.warning( _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format( lastVerseNumber, verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) elif verseNumber == lastVerseNumber: if vText == lastVText: logging.warning( _("Ignored duplicated {} verse in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) else: logging.warning( _("Ignored duplicated {} verse number in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) ) # Check for paragraph markers if vText and vText[0]=='¶': thisBook.addLine( 'p', '' ) vText = vText[1:].lstrip() #print( '{} {}:{} = {!r}'.format( BBB, chapterNumberString, verseNumberString, vText ) ) thisBook.addLine( 'v', verseNumberString + ' ' + vText ) lastVText = vText lastVerseNumber = verseNumber else: # No bookCode yet logging.warning( "VPLBible.load{} is skipping unknown pre-book line: {}".format( vplType, line ) ) # Save the final book if thisBook is not None: self.stashBook( thisBook ) # Clean up if settingsDict: #print( "VPL settingsDict", settingsDict ) if self.suppliedMetadata is None: self.suppliedMetadata = {} self.suppliedMetadata['VPL'] = settingsDict self.applySuppliedMetadata( 'VPL' ) # Copy some to self.settingsDict self.doPostLoadProcessing()
class OpenSongXMLBible(Bible):
    """
    Class for reading, validating, and converting OpenSong Bible XML.

    The class attributes below are the XML tag names that the loader looks
    for at each level of the tree: the root container, then the book,
    chapter and verse elements.
    """
    treeTag = 'bible'
    bookTag = 'b'
    chapterTag = 'c'
    verseTag = 'v'

    def __init__(self, sourceFolder, givenName, encoding='utf-8'):
        """
        Constructor: just sets up the XML Bible file converter object.

        sourceFolder and givenName are joined to form self.sourceFilepath.
        encoding is only stored on the object here.

        Nothing is parsed yet (call load() for that), but a message is
        printed if the file is not readable.
        """
        # Setup and initialise the base class first
        if BibleOrgSysGlobals.debugFlag:
            print("OpenSongXMLBible( {}, {}, {} )".format(sourceFolder, givenName, encoding))
        Bible.__init__(self)
        self.objectNameString = 'OpenSong XML Bible object'
        self.objectTypeString = 'OpenSong'

        # Now we can set our object variables
        self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding
        self.sourceFilepath = os.path.join(self.sourceFolder, self.givenName)

        self.tree = None  # Will hold the XML data

        # Get the data tables that we need for proper checking
        self.genericBOS = BibleOrganizationalSystem('GENERIC-KJV-66-ENG')

        # Do a preliminary check on the readability of our file
        if not os.access(self.sourceFilepath, os.R_OK):
            print("OpenSongXMLBible: File {!r} is unreadable".format(self.sourceFilepath))

        self.name = self.givenName
    # end of OpenSongXMLBible.__init__

    def load(self):
        """
        Load a single source XML file and load book elements.

        Parses self.sourceFilepath into self.tree, checks that the root
        element is the expected container, hands every book element to
        __validateAndExtractBook(), and finally calls
        self.doPostLoadProcessing() (also when the root tag was wrong).
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading {}…").format(self.sourceFilepath))
        self.tree = ElementTree().parse(self.sourceFilepath)
        if BibleOrgSysGlobals.debugFlag:
            assert len(self.tree)  # Fail here if we didn't load anything at all

        # Find the main (bible) container
        if self.tree.tag == OpenSongXMLBible.treeTag:
            location = "XML file"
            BibleOrgSysGlobals.checkXMLNoText(self.tree, location, '4f6h')
            BibleOrgSysGlobals.checkXMLNoTail(self.tree, location, '1wk8')

            # "n" and "sn" are accepted silently; their values are not used
            # NOTE(review): presumably the name and short name -- confirm
            for attrib, value in self.tree.items():
                if attrib not in ("n", "sn"):
                    logging.warning("Unprocessed {!r} attribute ({}) in main element".format(attrib, value))

            # Find the submain (book) containers
            for element in self.tree:
                if element.tag == OpenSongXMLBible.bookTag:
                    sublocation = "book in " + location
                    BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'g3g5')
                    BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'd3f6')
                    self.__validateAndExtractBook(element)
                elif element.tag in ('OT', 'NT'):
                    pass  # These elements are skipped without any processing
                else:
                    logging.error("Expected to find {!r} but got {!r}".format(OpenSongXMLBible.bookTag, element.tag))
        else:
            logging.error("Expected to load {!r} but got {!r}".format(OpenSongXMLBible.treeTag, self.tree.tag))
        self.doPostLoadProcessing()
    # end of OpenSongXMLBible.load

    def __validateAndExtractBook(self, book):
        """
        Check/validate and extract book data from the given XML book record
            finding chapter subelements.

        The book's "n" attribute is looked up to get a BBB book code, first
        with self.genericBOS and then with the (lazily loaded) module-level
        BibleBooksNames. If no name or no code can be found, an error is
        logged and the book is not stashed.
        """
        global BibleBooksNames

        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating OpenSong XML book…"))

        # Process the div attributes first
        bookName = None
        for attrib, value in book.items():
            if attrib == "n":
                bookName = value
            else:
                logging.warning("Unprocessed {!r} attribute ({}) in book element".format(attrib, value))
        if not bookName:
            logging.error(_("OpenSong load can't find a book name"))
            return

        BBB = self.genericBOS.getBBBFromText(bookName)  # Booknames are usually in English
        if not BBB:  # wasn't English
            if BibleBooksNames is None:
                BibleBooksNames = BibleBooksNamesSystems().loadData()
            BBB = BibleBooksNames.getBBBFromText(bookName)  # Try non-English booknames
        if not BBB:
            logging.error(_("OpenSong load doesn't recognize book name: {!r}").format(bookName))
            return

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Validating {} {}…").format(BBB, bookName))
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'OpenSong XML Bible Book object'
        thisBook.objectTypeString = 'OpenSong'

        # Start the book with id, header and main-title lines
        USFMAbbreviation = BibleOrgSysGlobals.BibleBooksCodes.getUSFMAbbreviation(BBB)
        thisBook.addLine('id', '{} imported by {}'.format(USFMAbbreviation.upper(), ProgNameVersion))
        thisBook.addLine('h', bookName)
        thisBook.addLine('mt1', bookName)

        for element in book:
            if element.tag == OpenSongXMLBible.chapterTag:
                sublocation = "chapter in {}".format(BBB)
                BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'j3jd')
                BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'al1d')
                self.__validateAndExtractChapter(BBB, thisBook, element)
            else:
                logging.error("Expected to find {!r} but got {!r}".format(OpenSongXMLBible.chapterTag, element.tag))

        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(" Saving {} into results…".format(BBB))
        self.stashBook(thisBook)
    # end of OpenSongXMLBible.__validateAndExtractBook

    def __validateAndExtractChapter(self, BBB, thisBook, chapter):
        """
        Check/validate and extract chapter data from the given XML book record
            finding and saving chapter numbers and
            finding and saving verse elements.

        BBB is the book code (used here for messages only), thisBook is the
        book object that lines are added to, and chapter is the XML chapter
        element. A 'c' line is added for the chapter, and a 'v' line for
        each verse that has text. Verse text containing newlines is split
        into a 'v' line followed by 'q1' lines.
        """
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print(_("Validating XML chapter…"))

        # Process the div attributes first
        chapterNumber = None
        for attrib, value in chapter.items():
            if attrib == "n":
                chapterNumber = value
            elif attrib == "VERSES":
                pass  # Accepted silently; the value is not used
            else:
                logging.warning("Unprocessed {!r} attribute ({}) in chapter element".format(attrib, value))
        if chapterNumber:
            chapterNumber = chapterNumber.replace('of Solomon ', '')  # Fix a mistake in the Chinese_SU module
            thisBook.addLine('c', chapterNumber)
        else:
            logging.error("Missing 'n' attribute in chapter element for {}".format(BBB))

        for element in chapter:
            if element.tag != OpenSongXMLBible.verseTag:
                logging.error("Expected to find {!r} but got {!r}".format(OpenSongXMLBible.verseTag, element.tag))
                continue

            sublocation = "verse in {} {}".format(BBB, chapterNumber)
            BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'l5ks')
            verseNumber = None
            for attrib, value in element.items():
                if attrib == "n":
                    verseNumber = value
                elif attrib == "t":
                    pass  # Accepted silently; presumably the end of a verse range -- not used
                else:
                    logging.warning("Unprocessed {!r} attribute ({}) in verse element".format(attrib, value))
            if BibleOrgSysGlobals.debugFlag:
                assert verseNumber
            if not verseNumber:
                # Without a number we can't build the 'v' line (it would raise
                #   TypeError below), so report it and skip this verse -- the
                #   same tolerant handling as for a chapter without a number
                logging.error("Missing 'n' attribute in verse element for {} {}".format(BBB, chapterNumber))
                continue

            # Gather the verse text, including any italicised sections
            vText = element.text or ''
            for subelement in element:
                sub2location = "{} in {}".format(subelement.tag, sublocation)
                BibleOrgSysGlobals.checkXMLNoAttributes(subelement, sub2location, 'ks03')
                BibleOrgSysGlobals.checkXMLNoSubelements(subelement, sub2location, 'ks05')
                if subelement.tag == 'i':
                    # ElementTree gives None for an absent text or tail, and
                    #   we don't want the string "None" in the verse text
                    vText += '\\it {}\\it*{}'.format(subelement.text or '', subelement.tail or '')
                else:
                    logging.error("Expected to find 'i' but got {!r}".format(subelement.tag))
            vText += element.tail or ''

            if not vText:
                logging.warning("{} {}:{} has no text".format(BBB, chapterNumber, verseNumber))
                continue

            # This is the main text of the verse (follows the verse milestone)
            if '\n' in vText:  # This is how they represent poetry
                firstBit, *otherBits = vText.split('\n')
                thisBook.addLine('q1', '')
                thisBook.addLine('v', verseNumber + ' ' + firstBit)
                for textBit in otherBits:
                    thisBook.addLine('q1', textBit)
            else:  # Just one verse line
                thisBook.addLine('v', verseNumber + ' ' + vText)
    # end of OpenSongXMLBible.__validateAndExtractChapter