def __validateAndExtractBook(self, book):
    """
    Check/validate and extract book data from the given XML book record
        finding chapter subelements.

    The book is identified from its "bnumber" attribute or, only when no
    number is given, from its "bname" attribute.  Caption subelements are
    stored as 'mt' lines, chapter subelements are handed to
    __validateAndExtractChapter(), and the finished book is passed to
    self.saveBook().

    A book that cannot be identified is logged and skipped.  Nothing is
    returned.
    """
    if Globals.verbosityLevel > 3:
        print(_("Validating XML book..."))

    # Process the div attributes first
    BBB = bookName = bookShortName = bookNumber = None
    for attrib, value in book.items():
        if attrib == "bnumber":
            bookNumber = value
        elif attrib == "bname":
            bookName = value
        elif attrib == "bsname":
            bookShortName = value
        else:
            logging.warning(
                "Unprocessed '{}' attribute ({}) in book element".format(
                    attrib, value))

    if bookNumber:
        try:
            BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(
                bookNumber)
        except KeyError:
            logging.warning(
                "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it"
                .format(bookNumber, bookName, bookShortName))
    elif bookName:
        BBB = self.genericBOS.getBBB(bookName)
        if not BBB:
            # Don't drop the book silently -- the OpenSong loader reports this too
            logging.error(
                _("Haggai load doesn't recognize book name: '{}'").format(
                    bookName))
    else:
        # Neither a number nor a name was given, so there's nothing to go on
        logging.error(_("Haggai load can't find a book number or name"))

    if not BBB:
        return  # Every failure path above has already been logged

    if Globals.verbosityLevel > 2:
        print(_("Validating {} {}...").format(BBB, bookName))
    thisBook = BibleBook(self, BBB)
    thisBook.objectNameString = "Haggai XML Bible Book object"
    thisBook.objectTypeString = "Haggai"

    for element in book:
        if element.tag == HaggaiXMLBible.captionTag:
            sublocation = "caption in {}".format(BBB)
            Globals.checkXMLNoAttributes(element, sublocation, 'jhl6')
            Globals.checkXMLNoSubelements(element, sublocation, 'jk21')
            Globals.checkXMLNoTail(element, sublocation, 'kjh6')
            thisBook.appendLine('mt', element.text)
        elif element.tag == HaggaiXMLBible.chapterTag:
            sublocation = "chapter in {}".format(BBB)
            Globals.checkXMLNoText(element, sublocation, 'j3jd')
            Globals.checkXMLNoTail(element, sublocation, 'al1d')
            self.__validateAndExtractChapter(BBB, thisBook, element)
        else:
            logging.error("Expected to find '{}' but got '{}'".format(
                HaggaiXMLBible.chapterTag, element.tag))

    if Globals.verbosityLevel > 2:
        print(" Saving {} into results...".format(BBB))
    self.saveBook(thisBook)
def __validateAndExtractBook( self, book ):
    """
    Check/validate and extract book data from the given XML book record
        finding chapter subelements.

    The book is identified from its "bnumber" attribute or, only when no
    number is given, from its "bname" attribute.  Caption subelements are
    stored as 'mt' lines, chapter subelements are handed to
    __validateAndExtractChapter(), and the finished book is passed to
    self.saveBook().

    A book that cannot be identified is logged and skipped.  Nothing is
    returned.
    """
    if Globals.verbosityLevel > 3: print( _("Validating XML book...") )

    # Process the div attributes first
    BBB = bookName = bookShortName = bookNumber = None
    for attrib,value in book.items():
        if attrib=="bnumber":
            bookNumber = value
        elif attrib=="bname":
            bookName = value
        elif attrib=="bsname":
            bookShortName = value
        else: logging.warning( "Unprocessed '{}' attribute ({}) in book element".format( attrib, value ) )

    if bookNumber:
        try: BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber )
        except KeyError:
            logging.warning( "Unable to deduce which book is number={}, name={}, shortName={} -- ignoring it" \
                                                    .format( bookNumber, bookName, bookShortName ) )
    elif bookName:
        BBB = self.genericBOS.getBBB( bookName )
        if not BBB: # Don't drop the book silently -- the OpenSong loader reports this too
            logging.error( _("Haggai load doesn't recognize book name: '{}'").format( bookName ) )
    else: # Neither a number nor a name was given, so there's nothing to go on
        logging.error( _("Haggai load can't find a book number or name") )

    if not BBB: return # Every failure path above has already been logged

    if Globals.verbosityLevel > 2: print( _("Validating {} {}...").format( BBB, bookName ) )
    thisBook = BibleBook( self, BBB )
    thisBook.objectNameString = "Haggai XML Bible Book object"
    thisBook.objectTypeString = "Haggai"

    for element in book:
        if element.tag == HaggaiXMLBible.captionTag:
            sublocation = "caption in {}".format( BBB )
            Globals.checkXMLNoAttributes( element, sublocation, 'jhl6' )
            Globals.checkXMLNoSubelements( element, sublocation, 'jk21' )
            Globals.checkXMLNoTail( element, sublocation, 'kjh6' )
            thisBook.appendLine( 'mt', element.text )
        elif element.tag == HaggaiXMLBible.chapterTag:
            sublocation = "chapter in {}".format( BBB )
            Globals.checkXMLNoText( element, sublocation, 'j3jd' )
            Globals.checkXMLNoTail( element, sublocation, 'al1d' )
            self.__validateAndExtractChapter( BBB, thisBook, element )
        else: logging.error( "Expected to find '{}' but got '{}'".format( HaggaiXMLBible.chapterTag, element.tag ) )

    if Globals.verbosityLevel > 2: print( " Saving {} into results...".format( BBB ) )
    self.saveBook( thisBook )
def __validateAndExtractBook(self, book):
    """
    Validate one OpenSong XML book element and extract its contents.

    The book name is taken from the element's "n" attribute and converted
    to a BBB code with self.genericBOS.getBBB().  A new BibleBook is given
    'id', 'h' and 'mt1' lines, every chapter subelement is handed to
    __validateAndExtractChapter(), and the book is passed to
    self.saveBook().

    An error is logged (and nothing is saved) when there is no book name or
    when the name is not recognised.  Nothing is returned.
    """
    if Globals.verbosityLevel > 3:
        print(_("Validating OpenSong XML book..."))

    # Only the "n" (name) attribute is understood -- report anything else
    bookName = None
    for attributeName, attributeValue in book.items():
        if attributeName == "n":
            bookName = attributeValue
            continue
        logging.warning(
            "Unprocessed '{}' attribute ({}) in book element".format(
                attributeName, attributeValue))

    if not bookName:
        logging.error(_("OpenSong load can't find a book name"))
        return

    BBB = self.genericBOS.getBBB(bookName)
    if not BBB:
        logging.error(
            _("OpenSong load doesn't recognize book name: '{}'").format(
                bookName))
        return

    if Globals.verbosityLevel > 2:
        print(_("Validating {} {}...").format(BBB, bookName))

    # Start the book off with its identification and title lines
    thisBook = BibleBook(self, BBB)
    thisBook.objectNameString = "OpenSong XML Bible Book object"
    thisBook.objectTypeString = "OpenSong"
    abbreviation = Globals.BibleBooksCodes.getUSFMAbbreviation(BBB)
    idText = '{} imported by {}'.format(abbreviation.upper(), ProgNameVersion)
    thisBook.appendLine('id', idText)
    for titleMarker in ('h', 'mt1'):
        thisBook.appendLine(titleMarker, bookName)

    # Chapters are the only subelements expected inside a book
    for element in book:
        if element.tag != OpenSongXMLBible.chapterTag:
            logging.error("Expected to find '{}' but got '{}'".format(
                OpenSongXMLBible.chapterTag, element.tag))
            continue
        sublocation = "chapter in {}".format(BBB)
        Globals.checkXMLNoText(element, sublocation, 'j3jd')
        Globals.checkXMLNoTail(element, sublocation, 'al1d')
        self.__validateAndExtractChapter(BBB, thisBook, element)

    if Globals.verbosityLevel > 2:
        print(" Saving {} into results...".format(BBB))
    self.saveBook(thisBook)
def __validateAndExtractBook( self, book ):
    """
    Validate one OpenSong XML book element and extract its contents.

    The book name is taken from the element's "n" attribute and converted
    to a BBB code with self.genericBOS.getBBB().  A new BibleBook is given
    'id', 'h' and 'mt1' lines, every chapter subelement is handed to
    __validateAndExtractChapter(), and the book is passed to
    self.saveBook().

    An error is logged (and nothing is saved) when there is no book name or
    when the name is not recognised.  Nothing is returned.
    """
    if Globals.verbosityLevel > 3: print( _("Validating OpenSong XML book...") )

    # Only the "n" (name) attribute is understood -- report anything else
    bookName = None
    for attributeName,attributeValue in book.items():
        if attributeName == "n":
            bookName = attributeValue
            continue
        logging.warning( "Unprocessed '{}' attribute ({}) in book element".format( attributeName, attributeValue ) )

    if not bookName:
        logging.error( _("OpenSong load can't find a book name") )
        return

    BBB = self.genericBOS.getBBB( bookName )
    if not BBB:
        logging.error( _("OpenSong load doesn't recognize book name: '{}'").format( bookName ) )
        return

    if Globals.verbosityLevel > 2: print( _("Validating {} {}...").format( BBB, bookName ) )

    # Start the book off with its identification and title lines
    thisBook = BibleBook( self, BBB )
    thisBook.objectNameString = "OpenSong XML Bible Book object"
    thisBook.objectTypeString = "OpenSong"
    abbreviation = Globals.BibleBooksCodes.getUSFMAbbreviation( BBB )
    idText = '{} imported by {}'.format( abbreviation.upper(), ProgNameVersion )
    thisBook.appendLine( 'id', idText )
    for titleMarker in ( 'h', 'mt1' ):
        thisBook.appendLine( titleMarker, bookName )

    # Chapters are the only subelements expected inside a book
    for element in book:
        if element.tag != OpenSongXMLBible.chapterTag:
            logging.error( "Expected to find '{}' but got '{}'".format( OpenSongXMLBible.chapterTag, element.tag ) )
            continue
        sublocation = "chapter in {}".format( BBB )
        Globals.checkXMLNoText( element, sublocation, 'j3jd' )
        Globals.checkXMLNoTail( element, sublocation, 'al1d' )
        self.__validateAndExtractChapter( BBB, thisBook, element )

    if Globals.verbosityLevel > 2: print( " Saving {} into results...".format( BBB ) )
    self.saveBook( thisBook )
def load( self ):
    """
    Load a single source file and load book elements.

    Opens self.sourceFilepath as an SQLite database, copies the first row of
    the Details table into self.settingsDict, and uses the OT/NT settings to
    decide which book to start at and how many books and rows to expect.
    Each expected verse is then fetched from the Bible table one reference
    at a time (using the GENERIC-KJV-66-ENG versification) and every
    non-blank text line is passed to self.handleLine().

    Problems noticed on the way are logged; most are also collected and
    stored in self.errorDictionary['Load Errors'].  The database connection
    is always closed, even if loading fails part-way through.
    Finishes by calling self.doPostLoadProcessing().
    """
    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )
    loadErrors = []

    fileExtensionUpper = self.fileExtension.upper()
    if fileExtensionUpper not in filenameEndingsToAccept:
        logging.critical( "{} doesn't appear to be a e-Sword file".format( self.sourceFilename ) )
    elif not self.sourceFilename.upper().endswith( BibleFilenameEndingsToAccept[0] ):
        logging.critical( "{} doesn't appear to be a e-Sword Bible file".format( self.sourceFilename ) )

    connection = sqlite3.connect( self.sourceFilepath )
    try: # so that the database handle is released whatever happens below
        connection.row_factory = sqlite3.Row # Enable row names
        cursor = connection.cursor()

        # First get the settings
        cursor.execute( 'select * from Details' )
        row = cursor.fetchone()
        for key in row.keys():
            self.settingsDict[key] = row[key]
        if 'Description' in self.settingsDict and len(self.settingsDict['Description'])<40: self.name = self.settingsDict['Description']
        if 'Abbreviation' in self.settingsDict: self.abbreviation = self.settingsDict['Abbreviation']
        if 'encryption' in self.settingsDict: logging.critical( "{} is encrypted: level {}".format( self.sourceFilename, self.settingsDict['encryption'] ) )

        # Just get some information from the file
        cursor.execute( 'select * from Bible' )
        rows = cursor.fetchall()
        numRows = len(rows)
        if Globals.debugFlag or Globals.verbosityLevel>2: print( '{} rows found'.format( numRows ) )
        BBBn1 = rows[0][0]
        if Globals.debugFlag or Globals.verbosityLevel>2: print( 'First book number is {}'.format( BBBn1 ) )
        del rows # Only the count and the first book number were needed
        BBB1 = None
        if BBBn1 <= 66: BBB1 = Globals.BibleBooksCodes.getBBBFromReferenceNumber( BBBn1 )

        # Work out what the settings say this module should contain
        testament = BBB = None
        booksExpected = textLineCountExpected = 0
        if self.settingsDict['OT'] and self.settingsDict['NT']:
            testament, BBB = 'BOTH', 'GEN'
            booksExpected, textLineCountExpected = 66, 31102
        elif self.settingsDict['OT']:
            testament, BBB = 'OT', 'GEN'
            booksExpected, textLineCountExpected = 39, 23145
        elif self.settingsDict['NT']:
            testament, BBB = 'NT', 'MAT'
            booksExpected, textLineCountExpected = 27, 7957
        elif self.settingsDict['Abbreviation'] == 'VIN2011': # Handle encoding error
            logging.critical( "e-Sword settings encoding error -- no testament set: {}".format( self.settingsDict ) )
            loadErrors.append( "e-Sword settings encoding error -- no testament set: {}".format( self.settingsDict ) )
            testament, BBB = 'BOTH', 'GEN'
            booksExpected, textLineCountExpected = 66, 31102
        elif self.settingsDict['Apocrypha']: # incomplete
            testament, BBB = 'AP', 'XXX'
            booksExpected, textLineCountExpected = 99, 999999
            # NOTE(review): 'halt' is not defined in the visible code, so this presumably
            #   aborts with a NameError on purpose (unfinished branch) -- confirm
            halt
        if not BBB:
            logging.critical( "e-Sword settings encoding error -- no testament set: {}".format( self.settingsDict ) )
            loadErrors.append( "e-Sword settings encoding error -- no testament set: {}".format( self.settingsDict ) )

        if Globals.debugFlag or Globals.verbosityLevel>2:
            print( "Testament={} BBB={} BBB1={}, bE={}, tLCE={} nR={}".format( testament, BBB, BBB1, booksExpected, textLineCountExpected, numRows ) )
        if BBB1 != BBB:
            logging.critical( "First book seems wrong: {} instead of {}".format( BBB1, BBB ) )
            loadErrors.append( "First book seems wrong: {} instead of {}".format( BBB1, BBB ) )
            if not BBB: BBB = BBB1 # Fall back to whatever book the data starts with
        if numRows != textLineCountExpected:
            logging.critical( "Row count seems wrong: {} instead of {}".format( numRows, textLineCountExpected ) )
            loadErrors.append( "Row count seems wrong: {} instead of {}".format( numRows, textLineCountExpected ) )

        BOS = BibleOrganizationalSystem( "GENERIC-KJV-66-ENG" )

        # Create the first book
        thisBook = BibleBook( self.name, BBB )
        thisBook.objectNameString = "e-Sword Bible Book object"
        thisBook.objectTypeString = "e-Sword"

        verseList = BOS.getNumVersesList( BBB )
        numC, numV = len(verseList), verseList[0]
        nBBB = Globals.BibleBooksCodes.getReferenceNumber( BBB )
        C = V = 1

        bookCount = 0
        ourGlobals = {}
        ourGlobals['haveParagraph'] = False
        haveLines = False
        while True:
            cursor.execute('select Scripture from Bible where Book=? and Chapter=? and Verse=?', (nBBB,C,V) )
            row = cursor.fetchone() # None when the database has no row for this reference
            line = None if row is None else row[0]

            if line is None: logging.warning( "ESwordBible.load: Found missing verse line at {} {}:{}".format( BBB, C, V ) )
            elif not isinstance( line, str ): # Not text, so we can't use it -- give up on the module
                if 'encryption' in self.settingsDict:
                    logging.critical( "ESwordBible.load: Unable to decrypt verse line at {} {}:{} {}".format( BBB, C, V, repr(line) ) )
                else:
                    logging.critical( "ESwordBible.load: Probably encrypted module: Unable to decode verse line at {} {}:{} {} {}".format( BBB, C, V, repr(line), self.settingsDict ) )
                break
            elif not line: logging.warning( "ESwordBible.load: Found blank verse line at {} {}:{}".format( BBB, C, V ) )
            else:
                haveLines = True

                # Some modules end lines with \r\n or have it in the middle!
                #   (We just ignore these for now)
                if '\r' in line or '\n' in line:
                    if Globals.debugFlag:
                        logging.warning( "ESwordBible.load: Found CR or LF characters in verse line at {} {}:{}".format( BBB, C, V ) )
                while line and line[-1] in '\r\n': line = line[:-1] # Remove CR/LFs from the end
                line = line.replace( '\r\n', ' ' ).replace( '\r', ' ' ).replace( '\n', ' ' ) # Replace CR/LFs in the middle

                self.handleLine( self.name, BBB, C, V, line, thisBook, ourGlobals )

            # Step on to the next reference that the versification says should exist
            V += 1
            if V > numV:
                C += 1
                if C > numC: # Save this book now
                    if haveLines:
                        if Globals.verbosityLevel > 3: print( "Saving", BBB, bookCount+1 )
                        self.saveBook( thisBook )
                    bookCount += 1 # Not the number saved but the number we attempted to process
                    if bookCount >= booksExpected: break
                    BBB = BOS.getNextBookCode( BBB )
                    # Create the next book
                    thisBook = BibleBook( self.name, BBB )
                    thisBook.objectNameString = "e-Sword Bible Book object"
                    thisBook.objectTypeString = "e-Sword"
                    haveLines = False

                    verseList = BOS.getNumVersesList( BBB )
                    numC, numV = len(verseList), verseList[0]
                    nBBB = Globals.BibleBooksCodes.getReferenceNumber( BBB )
                    C = V = 1
                else: # next chapter only
                    numV = verseList[C-1]
                    V = 1

            # NOTE(review): placed at loop level (i.e., checked after every verse) --
            #   the flag is only ever reset here, so handleLine() presumably sets it; confirm
            if ourGlobals['haveParagraph']:
                thisBook.appendLine( 'p', '' )
                ourGlobals['haveParagraph'] = False

        if Globals.strictCheckingFlag or Globals.debugFlag: self.checkForExtraMaterial( cursor, BOS )
        cursor.close()
    finally:
        connection.close()

    if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
    self.doPostLoadProcessing()
def load(self):
    """
    Load a single source file and load book elements.

    The file at self.sourceFilepath is read as tab-separated lines whose
    first field gives the line type: 'info', 'book_name', 'verse',
    'pericope', 'parallel', 'xref' or 'footnote'.  Everything is first
    collected into dictionaries; then, book by book, section headings,
    footnotes and cross-references are merged into the verse text, which is
    added to a new BibleBook that is passed to self.saveBook().
    Finishes by calling self.doPostLoadProcessing().

    Within the decoded text a paragraph-level marker is temporarily flagged
    with TWO backslashes and a character-level marker with ONE, so that
    verses can later be split on the paragraph markers only.
    """
    if Globals.verbosityLevel > 2:
        print(_("Loading {}...").format(self.sourceFilepath))

    def decodeVerse(encodedVerseString):
        """
        Decodes the verse which has @ format codes.

        Returns the text with the @ codes replaced by backslash markers.
        """
        verseString = encodedVerseString
        if verseString.startswith('@@'):  # This simply means that encoding follows
            verseString = verseString[2:]
        if verseString.startswith('@@'):  # (checked twice, as in the original code)
            verseString = verseString[2:]
        # Paragraph markers (marked now with double backslash)
        verseString = verseString.replace('@^', '\\\\p ')
        verseString = verseString.replace('@0', '\\\\m ')
        # @4 must use the double backslash like @1..@3 do, otherwise q4 is
        #   never recognised as a paragraph marker by the splitting below
        verseString = (verseString.replace('@1', '\\\\q1 ')
                       .replace('@2', '\\\\q2 ')
                       .replace('@3', '\\\\q3 ')
                       .replace('@4', '\\\\q4 '))
        verseString = verseString.replace('@8', '\\\\m ')
        # Character markers (marked now with single backslash)
        verseString = verseString.replace('@6', '\\wj ').replace('@5', '\\wj*')
        verseString = verseString.replace('@9', '\\add ').replace('@7', '\\add*')  # or \\i ???
        # Footnote and cross-reference callers become \ffN and \xxN placeholders
        verseString = re.sub(r'@<f([0-9])@>@/', r'\\ff\1', verseString)
        verseString = re.sub(r'@<x([0-9])@>@/', r'\\xx\1', verseString)
        assert ('@' not in verseString)
        return verseString
    # end of decodeVerse

    # Read all the lines into bookDict
    bookNameDict, bookDict = OrderedDict(), OrderedDict()
    footnoteDict, xrefDict, headingDict = {}, {}, {}
    BBB = bookNumberString = chapterNumberString = verseNumberString = ''
    lastBBB = None
    bookLines = None  # Verse lines of the book currently being read
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            if line[-1] == '\n':
                line = line[:-1]  # Removing trailing newline character
            if not line:
                continue  # Just discard blank lines
            bits = line.split('\t')

            if bits[0] == 'info':
                assert (len(bits) == 3)
                if bits[1] == 'shortName':
                    self.name = bits[2]
                elif bits[1] in ('longName', 'description'):
                    pass  # Recognised, but not stored anywhere yet
                elif bits[1] == 'locale':
                    assert (2 <= len(bits[2]) <= 3)  # Checked, but not stored anywhere yet
                else:
                    # (The book number is reported here -- there's no separate book code)
                    logging.warning(
                        _("YETBible: unknown {} info field in {} {} {}:{}")
                        .format(repr(bits[1]), BBB, bookNumberString,
                                chapterNumberString, verseNumberString))

            elif bits[0] == 'book_name':
                assert (3 <= len(bits) <= 4)
                thisBBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bits[1])
                if len(bits) == 3:
                    bookNameDict[thisBBB] = bits[2], ''
                elif len(bits) == 4:
                    bookNameDict[thisBBB] = bits[2], bits[3]

            elif bits[0] == 'verse':
                assert (len(bits) == 5)
                bookNumberString, chapterNumberString, verseNumberString, encodedVerseString = bits[1:]
                if Globals.debugFlag:
                    assert (bookNumberString.isdigit())
                    assert (chapterNumberString.isdigit())
                    assert (verseNumberString.isdigit())
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                if BBB != lastBBB:  # We have a new book
                    if lastBBB is not None:  # We have a completed book to save
                        bookDict[lastBBB] = bookLines
                    assert (BBB in bookNameDict)
                    bookLines = OrderedDict()  # Keys are (C,V) strings
                verseString = decodeVerse(encodedVerseString)
                bookLines[(chapterNumberString, verseNumberString)] = verseString  # Just store it for now
                lastBBB = BBB

            elif bits[0] == 'pericope':
                assert (len(bits) == 5)
                bookNumberString, chapterNumberString, verseNumberString, encodedHeadingString = bits[1:]
                if Globals.debugFlag:
                    assert (bookNumberString.isdigit())
                    assert (chapterNumberString.isdigit())
                    assert (verseNumberString.isdigit())
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                headingString = encodedHeadingString.replace('@9', '\\it ').replace('@7', '\\it*')
                assert ('@' not in headingString)
                headingDict[(BBB, chapterNumberString, verseNumberString)] = headingString, []  # Blank refList

            elif bits[0] == 'parallel':  # These lines optionally follow pericope lines
                assert (len(bits) == 2)
                # Uses the reference remembered from the preceding line
                heading, refList = headingDict[(BBB, chapterNumberString, verseNumberString)]
                refList.append(bits[1])
                headingDict[(BBB, chapterNumberString, verseNumberString)] = heading, refList

            elif bits[0] == 'xref':
                assert (len(bits) == 6)
                bookNumberString, chapterNumberString, verseNumberString, indexNumberString, encodedNoteString = bits[1:]
                if Globals.debugFlag:
                    assert (bookNumberString.isdigit())
                    assert (chapterNumberString.isdigit())
                    assert (verseNumberString.isdigit())
                    assert (indexNumberString.isdigit())
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                noteString = encodedNoteString.replace('@9', '\\it ').replace('@7', '\\it*')
                noteString = re.sub(r'@<ta(.+?)@>', r'', noteString)  # Get rid of these encoded BCV references for now
                noteString = re.sub(r'@<to(.+?)@>', r'', noteString)  # Get rid of these OSIS BCV references for now
                noteString = noteString.replace('@/', '')
                assert ('@' not in noteString)
                xrefDict[(BBB, chapterNumberString, verseNumberString, indexNumberString)] = noteString

            elif bits[0] == 'footnote':
                assert (len(bits) == 6)
                bookNumberString, chapterNumberString, verseNumberString, indexNumberString, encodedNoteString = bits[1:]
                if Globals.debugFlag:
                    assert (bookNumberString.isdigit())
                    assert (chapterNumberString.isdigit())
                    assert (verseNumberString.isdigit())
                    assert (indexNumberString.isdigit())
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                noteString = encodedNoteString.replace('@9', '\\it ').replace('@7', '\\it*')
                assert ('@' not in noteString)
                footnoteDict[(BBB, chapterNumberString, verseNumberString, indexNumberString)] = noteString

            else:
                print("YETBible: Unknown line type", self.givenName, BBB,
                      bookNumberString, chapterNumberString,
                      verseNumberString, len(bits), bits)
                # NOTE(review): 'halt' is not defined in the visible code, so this
                #   presumably aborts with a NameError on purpose -- confirm
                halt
    if lastBBB is not None:  # (There's nothing to save if the file had no verse lines)
        bookDict[lastBBB] = bookLines  # Save the last book

    # Now process the books
    for BBB, bkData in bookDict.items():
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = "YET Bible Book object"
        thisBook.objectTypeString = "YET"
        lastChapterNumberString = None
        for (chapterNumberString, verseNumberString), verseString in bkData.items():
            # Insert headings (can only occur before verses)
            if (BBB, chapterNumberString, verseNumberString) in headingDict:
                heading, refList = headingDict[(BBB, chapterNumberString, verseNumberString)]
                thisBook.appendLine('s', heading)
                if refList:
                    refString = ""
                    for ref in refList:
                        refString += ('; ' if refString else '') + ref
                    thisBook.appendLine('r', '(' + refString + ')')

            # Insert footnotes and cross-references
            #   (each placeholder is a three-character marker plus a one-digit caller)
            while '\\ff' in verseString:
                fIx = verseString.index('\\ff')
                caller = verseString[fIx + 3]
                assert (caller.isdigit())
                note = footnoteDict[(BBB, chapterNumberString, verseNumberString, caller)]
                verseString = verseString[:fIx] + '\\f + \\ft ' + note + '\\f*' + verseString[fIx + 4:]
            while '\\xx' in verseString:
                fIx = verseString.index('\\xx')
                caller = verseString[fIx + 3]
                assert (caller.isdigit())
                note = xrefDict[(BBB, chapterNumberString, verseNumberString, caller)]
                verseString = verseString[:fIx] + '\\x - \\xt ' + note + '\\x*' + verseString[fIx + 4:]

            # Save the Bible data fields
            if chapterNumberString != lastChapterNumberString:
                thisBook.appendLine('c', chapterNumberString)
                lastChapterNumberString = chapterNumberString

            if verseString.startswith('\\\\'):  # It's an initial paragraph marker
                # The marker is one or two characters long and is followed by a space
                if verseString[3] == ' ':
                    marker, verseString = verseString[2], verseString[4:]
                elif verseString[4] == ' ':
                    marker, verseString = verseString[2:4], verseString[5:]
                else:
                    halt  # (See the note about 'halt' above)
                thisBook.appendLine(marker, '')
            assert (not verseString.startswith('\\\\'))

            bits = verseString.split('\\\\')  # Split on paragraph markers (but not character markers)
            for j, bit in enumerate(bits):
                if j == 0:
                    # Only the text BEFORE the first paragraph marker belongs on the
                    #   verse line -- the rest is added by the other branch, so using
                    #   the whole verseString here would store that text twice
                    thisBook.appendLine('v', verseNumberString + ' ' + bit.rstrip())
                else:
                    if bit[1] == ' ':
                        marker, bit = bit[0], bit[2:]
                    elif bit[2] == ' ':
                        marker, bit = bit[0:2], bit[3:]
                    else:
                        halt  # (See the note about 'halt' above)
                    thisBook.appendLine(marker, bit.rstrip())
        self.saveBook(thisBook)
    self.doPostLoadProcessing()
def load(self):
    """
    Load a single source file and load book elements.

    The file at self.sourceFilepath is read line by line.  Lines starting
    with '#' are comments; a comment of the form '#field<TAB>value' with a
    recognised field name is stored as an attribute of self.  Every other
    line is tab-separated with 4, 6 or 9 fields (book code, chapter, verse
    and text, optionally with sub-verse and sequence numbers, and optionally
    preceded by NRSVA book/chapter/verse fields).

    A new BibleBook is started whenever the book code changes, a 'c' line is
    added whenever the chapter changes, and each verse is added as a 'v'
    line.  Each completed book is passed to self.saveBook().
    Finishes by calling self.doPostLoadProcessing().
    """
    if Globals.verbosityLevel > 2:
        print(_("Loading {}...").format(self.sourceFilepath))

    BBB = None
    thisBook = None  # Stays None if the file contains no verse lines at all
    # Set these now so that a bad line can be reported before any verse was read
    bookCode = chapterNumberString = verseNumberString = None
    NRSVA_bookCode = NRSVA_chapterNumberString = NRSVA_verseNumberString = None
    subverseNumberString = sequenceNumberString = None
    lastBookCode = lastChapterNumber = lastVerseNumber = lastSequence = -1
    lastVText = ''
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            if line[-1] == '\n':
                line = line[:-1]  # Removing trailing newline character
            if not line:
                continue  # Just discard blank lines

            if line[0] == '#':
                hashBits = line[1:].split('\t')
                if len(hashBits) == 2 and hashBits[1]:  # We have some valid meta-data
                    # Should some of these be placed into self.settingsDict???
                    if hashBits[0] == 'name':
                        self.name = hashBits[1]
                    elif hashBits[0] == 'filetype':
                        self.filetype = hashBits[1]
                    elif hashBits[0] == 'copyright':
                        self.copyright = hashBits[1]
                    elif hashBits[0] == 'abbreviation':
                        self.abbreviation = hashBits[1]
                    elif hashBits[0] == 'language':
                        self.language = hashBits[1]
                    elif hashBits[0] == 'note':
                        self.note = hashBits[1]
                    elif hashBits[0] == 'columns':
                        self.columns = hashBits[1]
                    else:  # Only complain about fields that really are unknown
                        logging.warning(
                            "Unknown UnboundBible meta-data field '{}' = '{}'".format(
                                hashBits[0], hashBits[1]))
                continue  # Just discard comment lines

            bits = line.split('\t')
            if len(bits) == 4:
                bookCode, chapterNumberString, verseNumberString, vText = bits
            elif len(bits) == 6:
                bookCode, chapterNumberString, verseNumberString, \
                    subverseNumberString, sequenceNumberString, vText = bits
            elif len(bits) == 9:
                NRSVA_bookCode, NRSVA_chapterNumberString, NRSVA_verseNumberString, \
                    bookCode, chapterNumberString, verseNumberString, \
                    subverseNumberString, sequenceNumberString, vText = bits
            elif len(bits) == 1 and self.givenName.startswith('lxx_a_parsing_'):
                logging.warning(
                    _("Skipping bad '{}' line in {} {} {} {}:{}").format(
                        line, self.givenName, BBB, bookCode,
                        chapterNumberString, verseNumberString))
                continue
            else:
                print("Unexpected number of bits", self.givenName, BBB,
                      bookCode, chapterNumberString, verseNumberString,
                      len(bits), bits)
                # NOTE(review): 'halt' is not defined in the visible code, so this
                #   presumably aborts with a NameError on purpose -- confirm
                halt

            if NRSVA_bookCode:
                assert (len(NRSVA_bookCode) == 3)
            if NRSVA_chapterNumberString:
                assert (NRSVA_chapterNumberString.isdigit())
            if NRSVA_verseNumberString:
                assert (NRSVA_verseNumberString.isdigit())

            if not bookCode and not chapterNumberString and not verseNumberString:
                print("Skipping empty line in {} {} {} {}:{}".format(
                    self.givenName, BBB, bookCode, chapterNumberString,
                    verseNumberString))
                continue
            if Globals.debugFlag:
                assert (len(bookCode) == 3)
                assert (chapterNumberString.isdigit())
                assert (verseNumberString.isdigit())

            if subverseNumberString:
                logging.warning(
                    _("subverseNumberString '{}' in {} {} {}:{}").format(
                        subverseNumberString, BBB, bookCode,
                        chapterNumberString, verseNumberString))

            vText = vText.strip()  # Remove leading and trailing spaces
            if not vText:
                continue  # Just ignore blank verses I think
            if vText == '+':
                continue  # Not sure what this means in basic_english JHN 1:38

            chapterNumber = int(chapterNumberString)
            verseNumber = int(verseNumberString)
            if sequenceNumberString:
                if Globals.debugFlag:
                    assert (sequenceNumberString.isdigit())
                sequenceNumber = int(sequenceNumberString)
                if Globals.debugFlag:
                    assert (sequenceNumber > lastSequence
                            or self.givenName in ('gothic_latin', 'hebrew_bhs_consonants', 'hebrew_bhs_vowels', 'latvian_nt', 'ukrainian_1871',))  # Why???
                lastSequence = sequenceNumber

            if bookCode != lastBookCode:  # We've started a new book
                if lastBookCode != -1:  # Better save the last book
                    self.saveBook(thisBook)
                BBB = Globals.BibleBooksCodes.getBBBFromUnboundBibleCode(bookCode)
                thisBook = BibleBook(self, BBB)
                thisBook.objectNameString = "Unbound Bible Book object"
                thisBook.objectTypeString = "Unbound"
                lastBookCode = bookCode
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if Globals.debugFlag:
                    assert (chapterNumber > lastChapterNumber or BBB == 'ESG')  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info("Have chapter zero in {} {} {} {}:{}".format(
                        self.givenName, BBB, bookCode, chapterNumberString,
                        verseNumberString))
                thisBook.appendLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(
                    _("Ignored duplicate verse line in {} {} {} {}:{}").format(
                        self.givenName, BBB, bookCode, chapterNumberString,
                        verseNumberString))
                continue
            if BBB == 'PSA' and verseNumberString == '1' and vText.startswith('<') \
                    and self.givenName == 'basic_english':
                # Move Psalm titles to verse zero
                verseNumber = 0
            # NB: despite their wording, the next warnings don't skip the line --
            #   the verse is still added below
            if verseNumber < lastVerseNumber:
                logging.warning(
                    _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format(
                        lastVerseNumber, verseNumber, self.givenName, BBB,
                        bookCode, chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning(
                        _("Ignored duplicated {} verse in {} {} {} {}:{}").format(
                            verseNumber, self.givenName, BBB, bookCode,
                            chapterNumberString, verseNumberString))
                else:
                    logging.warning(
                        _("Ignored duplicated {} verse number in {} {} {} {}:{}").format(
                            verseNumber, self.givenName, BBB, bookCode,
                            chapterNumberString, verseNumberString))
            thisBook.appendLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (if the file actually contained one)
    if thisBook is not None:
        self.saveBook(thisBook)
    self.doPostLoadProcessing()
class GreekNT(Bible):
    """
    Class for handling a Greek NT object (which may contain one or more Bible books)

    Note: BBB is used in this class to represent the three-character referenceAbbreviation.
    """

    def __init__(self, sourceFilepath, givenName=None, encoding='utf-8'):
        """
        Constructor: expects the filepath of the source folder.

        Records the parameters and does a preliminary readability check:
        if sourceFilepath is a folder, the readable '.txt' files in it are
        collected into self.possibleFilenames; otherwise a critical message
        is logged if the path itself is unreadable.
        Nothing is actually loaded here -- call load() for that.
        """
        # Setup and initialise the base class first
        Bible.__init__(self)
        self.objectNameString = "Greek NT Bible object"
        self.objectTypeString = "GreekNT"

        # Now we can set our object variables
        self.sourceFilepath, self.givenName, self.encoding = sourceFilepath, givenName, encoding
        # Set the name before the readability check so that the object is
        # fully initialised even if we have to return early below
        self.name = self.givenName

        self.title = self.version = self.date = None
        self.tree = self.header = self.frontMatter = self.divs = self.divTypesString = None
        #self.bkData, self.USFMBooks = OrderedDict(), OrderedDict()
        self.lang = self.language = None

        # Do a preliminary check on the readability of our files
        self.possibleFilenames = []
        if os.path.isdir(self.sourceFilepath):  # We've been given a folder -- see if we can find the files
            for filename in os.listdir(self.sourceFilepath):
                if filename.lower().endswith('.txt'):
                    thisFilepath = os.path.join(self.sourceFilepath, filename)
                    #if Globals.debugFlag: print( "Trying {}...".format( thisFilepath ) )
                    if os.access(thisFilepath, os.R_OK):  # we can read that file
                        self.possibleFilenames.append(filename)
        elif not os.access(self.sourceFilepath, os.R_OK):
            logging.critical("GreekNT: File '{}' is unreadable".format(self.sourceFilepath))
            return  # No use continuing
        #print( self.possibleFilenames ); halt

        #gNTfc = GreekNTFileConverter( self.sourceFilepath ) # Load and process the XML
        #gNTfc.loadMorphGNT()
        #self.books = gNTfc.bookData
    # end of __init__

    #def x__str__( self ):
        #"""
        #This method returns the string representation of a Bible book code.
        #
        #@return: the name of a Bible object formatted as a string
        #@rtype: string
        #"""
        #result = "Greek Bible converter object"
        ##if self.title: result += ('\n' if result else '') + self.title
        ##if self.version: result += ('\n' if result else '') + "Version: {} ".format( self.version )
        ##if self.date: result += ('\n' if result else '') + "Date: {}".format( self.date )
        #if len(self.books)==1:
            #for BBB in self.books: break # Just get the first one
            #result += ('\n' if result else '') + " " + _("Contains one book: {}").format( BBB )
        #else: result += ('\n' if result else '') + " " + _("Number of books = {}").format( len(self.books) )
        #return result
    ## end of __str__

    def load(self):
        """
        Load every book listed in Greek.morphgntBooks (using the matching
        filename from Greek.morphgntFilenames) and then run the
        post-load processing.
        """
        if Globals.verbosityLevel > 2:
            print("Loading Greek NT from {}...".format(self.sourceFilepath))
        for BBB in Greek.morphgntBooks:
            # Pass on the encoding given to the constructor (it used to be ignored)
            self.loadBook(BBB, Greek.morphgntFilenames[BBB], self.encoding)
        if Globals.verbosityLevel > 3:
            print("{} books loaded.".format(len(self.books)))
        #if self.possibleFilenames: # then we possibly have multiple files, probably one for each book
            #for filename in self.possibleFilenames:
                #pathname = os.path.join( self.sourceFilepath, filename )
                #self.loadBook( pathname )
        #else: # most often we have all the Bible books in one file
            #self.loadFile( self.sourceFilepath )
        self.doPostLoadProcessing()
    # end of load

    def loadBook(self, BBB, filename, encoding='utf-8'):
        """
        Load one book file (one word per line) from the source folder.

        For each line, 'c' and 'v' lines are appended to the book whenever
        the chapter or verse number changes, followed by a 'vw' line (the
        four word forms joined with '/') and a 'g' line (POS code and
        parsing code joined with '/').
        The book is saved with saveBook() if it is non-empty.

        Malformed lines are not caught here: the assertions in unpackLine
        will propagate to the caller.
        """

        def unpackLine(line):
            """
            Split a data line into ((bn,cn,vn), (POSCode,parsingCode), (four word forms)).
            """
            # Should be seven parts in the line
            #   0 book/chapter/verse
            #   1 part of speech (POS)
            #   2 parsing code
            #   3 text (including punctuation)
            #   4 word (with punctuation stripped)
            #   5 normalized word
            #   6 lemma
            # e.g., 180101 N- ----NSM- Παῦλος Παῦλος Παῦλος Παῦλος
            #       180102 N- ----DSF- ⸀ἀδελφῇ ἀδελφῇ ἀδελφῇ ἀδελφή
            #       180102 P- -------- κατ’ κατ’ κατά κατά
            #       180102 N- ----DSF- ἐκκλησίᾳ· ἐκκλησίᾳ ἐκκλησίᾳ ἐκκλησία
            bits = line.split()
            assert (len(bits) == 7)
            #print( bits )

            bn, cn, vn = bits[0][0:2], bits[0][2:4], bits[0][4:6]
            if bn[0] == '0': bn = bn[1:]  # Remove any leading zero
            if cn[0] == '0': cn = cn[1:]  # Remove any leading zero
            if vn[0] == '0': vn = vn[1:]  # Remove any leading zero
            #print( b, c, v )

            POSCode = bits[1]
            assert (len(POSCode) == 2)
            assert (POSCode in Greek.POSCodes.keys())

            parsingCode = bits[2]
            assert (len(parsingCode) == 8)
            #print( parsingCode )
            for j, char in enumerate(parsingCode):
                assert (char in Greek.parsingCodes[j])
            assert (parsingCode[0] in Greek.personCodes)
            assert (parsingCode[1] in Greek.tenseCodes)
            assert (parsingCode[2] in Greek.voiceCodes)
            assert (parsingCode[3] in Greek.modeCodes)
            assert (parsingCode[4] in Greek.caseCodes)
            assert (parsingCode[5] in Greek.numberCodes)
            assert (parsingCode[6] in Greek.genderCodes)
            assert (parsingCode[7] in Greek.degreeCodes)

            return (bn, cn, vn,), (POSCode, parsingCode,), (bits[3], bits[4], bits[5], bits[6],)
        # end of unpackLine

        self.thisBook = BibleBook(self.name, BBB)
        self.thisBook.objectNameString = "Morph Greek NT Bible Book object"
        self.thisBook.objectTypeString = "MorphGNT"

        filepath = os.path.join(self.sourceFilepath, filename)
        if Globals.verbosityLevel > 2:
            print(" Loading {}...".format(filename))
        lineCount = 0
        lastC = lastV = None
        with open(filepath, encoding=encoding) as myFile:  # Automatically closes the file when done
            for line in myFile:
                lineCount += 1
                if lineCount == 1 and encoding.lower() == 'utf-8' and line and line[0] == chr(65279):  #U+FEFF
                    logging.info("GreekNT: Detected UTF-8 Byte Order Marker in {}".format(filename))
                    line = line[1:]  # Remove the UTF-8 Byte Order Marker
                # endswith() is safe even if removing the BOM left an empty string
                if line.endswith('\n'):
                    line = line[:-1]  # Removing trailing newline character
                #if not line: continue # Just discard blank lines
                #print ( 'gNT file line is "' + line + '"' )
                #if line[0]=='#': continue # Just discard comment lines
                ref, grammar, words = unpackLine(line)
                bn, cn, vn = ref
                POSCode, parsingCode = grammar
                word1, word2, word3, word4 = words

                if cn != lastC:  # Started a new chapter
                    self.thisBook.appendLine('c', cn)
                    lastC, lastV = cn, None
                if vn != lastV:  # Started a new verse
                    self.thisBook.appendLine('v', vn)
                    lastV = vn
                self.thisBook.appendLine('vw', "{}/{}/{}/{}".format(word1, word2, word3, word4))
                self.thisBook.appendLine('g', "{}/{}".format(POSCode, parsingCode))
                #reference = BBB,bits[0][1],bits[0][2], # Put the BBB into the reference
                #lineTuples.append( (reference,bits[1],bits[2],) )
                #print( reference,bits[1],bits[2] ); halt

        if self.thisBook:
            if Globals.verbosityLevel > 3:
                print(" {} words loaded from {}".format(len(self.thisBook), filename))
            self.saveBook(self.thisBook)
            #self.books[BBB] = self.thisBook
    # end of loadBook

    @staticmethod
    def _fileReference(mapping, key, reference, value):
        """
        Record that value was seen for key at reference.

        mapping[key] is a list of ( referenceList, value ) tuples with at
        most one tuple per distinct value. If value is already filed under
        key, reference is added to its reference list (once only);
        otherwise a new ( [reference], value ) tuple is appended.
        """
        entries = mapping.setdefault(key, [])
        for refList, oldValue in entries:
            if oldValue == value:
                if reference not in refList:
                    refList.append(reference)
                return
        entries.append(([reference], value,))
    # end of _fileReference

    def xanalyzeWords(self):
        """
        Go through the NT data and do some filing and sorting of the Greek words.

        Fills in:
            self.wordCounts: entry count per BBB plus a 'Total' entry
            self.actualWordsToNormalized: actual word -> [ ( refs, normalized word ), ... ]
            self.normalizedWordsToActual: normalized word -> [ ( refs, actual word ), ... ]
            self.normalizedWordsToParsing: normalized word -> [ ( refs, parsing ), ... ]
            self.lemmasToNormalizedWords: lemma -> [ ( refs, normalized word ), ... ]

        Each entry of self.books[BBB] is unpacked as
            reference, parsing, ( punctuatedWord, actualWord, normalizedWord, lemma )
        """
        if Globals.verbosityLevel > 3:
            print("analyzeWords: have {} books in the loaded NT".format(len(self.books)))

        self.wordCounts = {}  # Wordcount organized by BBB
        self.wordCounts['Total'] = 0
        self.actualWordsToNormalized, self.normalizedWordsToActual, self.normalizedWordsToParsing, self.lemmasToNormalizedWords = {}, {}, {}, {}
        for BBB in self.books:
            wordCount = len(self.books[BBB])
            self.wordCounts[BBB] = wordCount
            self.wordCounts['Total'] += wordCount
            if Globals.verbosityLevel > 3:
                print(" analyzeWords: {} has {} Greek words".format(BBB, wordCount))
            for reference, parsing, (punctuatedWord, actualWord, normalizedWord, lemma) in self.books[BBB]:  # Stuff is: reference,parsing,words
                # File the actual words
                self._fileReference(self.actualWordsToNormalized, actualWord, reference, normalizedWord)
                # File the normalized words
                self._fileReference(self.normalizedWordsToActual, normalizedWord, reference, actualWord)
                self._fileReference(self.normalizedWordsToParsing, normalizedWord, reference, parsing)
                # File the self.lemmasToNormalizedWords
                self._fileReference(self.lemmasToNormalizedWords, lemma, reference, normalizedWord)

        if Globals.verbosityLevel > 2:
            print("analyzeWords: NT has {} Greek words".format(self.wordCounts['Total']))

        # Report the size of each index, plus its first seven entries at higher verbosity
        summaries = (
            ("analyzeWords: NT has {} actual Greek words", self.actualWordsToNormalized),
            ("analyzeWords: NT has {} normalized Greek words", self.normalizedWordsToActual),
            ("analyzeWords: NT has {} normalized Greek words", self.normalizedWordsToParsing),
            ("analyzeWords: NT has {} Greek self.lemmasToNormalizedWords", self.lemmasToNormalizedWords),
        )
        for template, mapping in summaries:
            if Globals.verbosityLevel > 2:
                print(template.format(len(mapping)))
            if Globals.verbosityLevel > 3:
                for j, key in enumerate(mapping):
                    print(" ", key, mapping[key])
                    if j == 6: break

        if 0:  # Debugging aid -- change to 1 to list the ambiguous words
            print("The following actual words have multiple normalized forms:")
            for aW in self.actualWordsToNormalized:
                if len(self.actualWordsToNormalized[aW]) > 1:
                    print(" ", aW)
                    for entry in self.actualWordsToNormalized[aW]:
                        print(" ", entry[1], self.normalizedWordsToParsing[entry[1]], entry[0])
def load(self):
    """
    Load a single source file and load book elements.

    The file (self.sourceFilepath, read with self.encoding) is processed
    line by line; blank lines and lines starting with '#' are skipped.
    The first line is expected to be '*Bible'. After that come three
    sections of '|'-separated fields:
        version details (shortName|fullName|language) until '*Chapter',
        book details (code|fullName|shortName|numChapters) until '*Context',
        then verse lines (bookCode|chapter|verse|lineMark|text).
    A new book is started whenever the book code of a verse line changes,
    and each completed book is passed to self.saveBook().
    """
    if Globals.verbosityLevel > 2:
        print(_("Loading {}...").format(self.sourceFilepath))

    status = 0  # 0 = getting version details, 1 = getting chapters, 2 = getting verse data
    lineCount = 0
    BBB = lastBBB = None
    # Initialised here so that a file without any verse lines can't leave them unbound
    thisBook = None
    lastChapterNumberString = None
    bookDetails = {}  # NOTE: filled in below but not used anywhere else in this method
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if lineCount == 1 and self.encoding.lower() == 'utf-8' and line[0] == chr(65279):  #U+FEFF
                logging.info(" DrupalBible.load: Detected UTF-8 Byte Order Marker")
                line = line[1:]  # Remove the UTF-8 Byte Order Marker
            # endswith() is safe even if removing the BOM left an empty string
            if line.endswith('\n'):
                line = line[:-1]  # Removing trailing newline character
            if not line: continue  # Just discard blank lines
            #print ( 'DB file line is "' + line + '"' )
            if line[0] == '#': continue  # Just discard comment lines

            if lineCount == 1:
                if line != '*Bible':
                    logging.warning("Unknown DrupalBible first line: {}".format(repr(line)))

            elif status == 0:
                if line == '*Chapter':
                    status = 1
                else:  # Get the version name details
                    bits = line.split('|')
                    shortName, fullName, language = bits  # Fails if there aren't exactly three fields
                    self.name = fullName

            elif status == 1:
                if line == '*Context':
                    status = 2
                else:  # Get the book name details
                    bits = line.split('|')
                    bookCode, bookFullName, bookShortName, numChapters = bits
                    assert (bookShortName == bookCode)
                    BBBresult = Globals.BibleBooksCodes.getBBBFromDrupalBibleCode(bookCode)
                    # Result can be string or list of strings (best guess first)
                    BBB = BBBresult if isinstance(BBBresult, str) else BBBresult[0]
                    bookDetails[BBB] = bookFullName, bookShortName, numChapters

            elif status == 2:  # Get the verse text
                bits = line.split('|')
                bookCode, chapterNumberString, verseNumberString, lineMark, verseText = bits
                #chapterNumber, verseNumber = int( chapterNumberString ), int( verseNumberString )
                if lineMark:
                    print(repr(lineMark))
                    # NOTE(review): 'halt' isn't defined in the visible code -- presumably
                    #   a deliberate stop on unexpected data; confirm
                    halt
                BBBresult = Globals.BibleBooksCodes.getBBBFromDrupalBibleCode(bookCode)
                # Result can be string or list of strings (best guess first)
                BBB = BBBresult if isinstance(BBBresult, str) else BBBresult[0]
                if BBB != lastBBB:  # We've started a new book
                    if thisBook is not None:  # Better save the last book
                        self.saveBook(thisBook)
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = "DrupalBible Bible Book object"
                    thisBook.objectTypeString = "DrupalBible"
                    lastChapterNumberString = None
                    lastBBB = BBB
                if chapterNumberString != lastChapterNumberString:  # We've started a new chapter
                    thisBook.appendLine('c', chapterNumberString)
                    lastChapterNumberString = chapterNumberString
                # Text inside < > is converted to USFM italics
                verseText = verseText.replace('<', '\\it ').replace('>', '\\it*')
                thisBook.appendLine('v', verseNumberString + ' ' + verseText)

            else:
                halt  # Unexpected status value (see the note about 'halt' above)

    # Save the final book
    if thisBook is not None:
        self.saveBook(thisBook)
    else:
        logging.warning("DrupalBible.load: No verse data found in {}".format(self.sourceFilepath))
    self.doPostLoadProcessing()
class USFXXMLBible( Bible ): """ Class to load and manipulate USFX Bibles. """ def __init__( self, sourceFolder, givenName=None, encoding='utf-8' ): """ Create the internal USFX Bible object. """ # Setup and initialise the base class first Bible.__init__( self ) self.objectNameString = "USFX XML Bible object" self.objectTypeString = "USFX" self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding # Remember our parameters # Now we can set our object variables self.name = self.givenName if not self.name: self.name = os.path.basename( self.sourceFolder ) if not self.name: self.name = os.path.basename( self.sourceFolder[:-1] ) # Remove the final slash if not self.name: self.name = "USFX Bible" if self.name.endswith( '_usfx' ): self.name = self.name[:-5] # Remove end of name for Haiola projects # Do a preliminary check on the readability of our folder if not os.access( self.sourceFolder, os.R_OK ): logging.error( "USFXXMLBible: Folder '{}' is unreadable".format( self.sourceFolder ) ) # Do a preliminary check on the contents of our folder self.sourceFilename = self.sourceFilepath = None foundFiles, foundFolders = [], [] for something in os.listdir( self.sourceFolder ): somepath = os.path.join( self.sourceFolder, something ) if os.path.isdir( somepath ): foundFolders.append( something ) elif os.path.isfile( somepath ): somethingUpper = something.upper() somethingUpperProper, somethingUpperExt = os.path.splitext( somethingUpper ) ignore = False for ending in filenameEndingsToIgnore: if somethingUpper.endswith( ending): ignore=True; break if ignore: continue if not somethingUpperExt[1:] in extensionsToIgnore: # Compare without the first dot foundFiles.append( something ) else: logging.error( "Not sure what '{}' is in {}!".format( somepath, self.sourceFolder ) ) if foundFolders: logging.info( "USFXXMLBible: Surprised to see subfolders in '{}': {}".format( self.sourceFolder, foundFolders ) ) if not foundFiles: if Globals.verbosityLevel > 0: print( 
"USFXXMLBible: Couldn't find any files in '{}'".format( self.sourceFolder ) ) return # No use continuing #print( self.sourceFolder, foundFolders, len(foundFiles), foundFiles ) numFound = 0 for thisFilename in sorted( foundFiles ): firstLines = Globals.peekIntoFile( thisFilename, sourceFolder, numLines=3 ) if not firstLines or len(firstLines)<2: continue if not firstLines[0].startswith( '<?xml version="1.0"' ) \ and not firstLines[0].startswith( '\ufeff<?xml version="1.0"' ): # same but with BOM if Globals.verbosityLevel > 2: print( "USFXB (unexpected) first line was '{}' in {}".format( firstLines, thisFilename ) ) continue if "<usfx " not in firstLines[0]: continue lastFilenameFound = thisFilename numFound += 1 if numFound: if Globals.verbosityLevel > 2: print( "USFXXMLBible got", numFound, sourceFolder, lastFilenameFound ) if numFound == 1: self.sourceFilename = lastFilenameFound self.sourceFilepath = os.path.join( self.sourceFolder, self.sourceFilename ) elif looksHopeful and Globals.verbosityLevel > 2: print( " Looked hopeful but no actual files found" ) # end of USFXXMLBible.__init_ def load( self ): """ Load the XML data file -- we should already know the filepath. 
""" if Globals.verbosityLevel > 1: print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) ) #if Globals.verbosityLevel > 2: print( _(" It seems we have {}...").format( BBB ) ) #self.thisBook = BibleBook( self.name, BBB ) #self.thisBook.objectNameString = "OSIS XML Bible Book object" #self.thisBook.objectTypeString = "OSIS" #self.haveBook = True try: self.tree = ElementTree().parse( self.sourceFilepath ) except ParseError: errorString = sys.exc_info()[1] logging.critical( "USFXXMLBible.load: failed loading the xml file {}: '{}'.".format( self.sourceFilepath, errorString ) ) return if Globals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all # Find the main (osis) container if self.tree.tag == 'usfx': location = "USFX file" Globals.checkXMLNoText( self.tree, location, '4f6h' ) Globals.checkXMLNoTail( self.tree, location, '1wk8' ) # Process the attributes first self.schemaLocation = None for attrib,value in self.tree.items(): #print( "attrib", repr(attrib), repr(value) ) if attrib.endswith("SchemaLocation"): self.schemaLocation = value else: logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) ) BBB = C = V = None for element in self.tree: #print( "element", repr(element.tag) ) sublocation = element.tag + " " + location if element.tag == 'languageCode': self.languageCode = element.text Globals.checkXMLNoTail( element, sublocation, 'cff3' ) Globals.checkXMLNoAttributes( element, sublocation, 'des1' ) Globals.checkXMLNoSubelements( element, sublocation, 'dwf2' ) elif element.tag == 'book': self.loadBook( element ) ##Globals.checkXMLNoSubelements( element, sublocation, '54f2' ) #Globals.checkXMLNoTail( element, sublocation, 'hd35' ) ## Process the attributes #idField = bookStyle = None #for attrib,value in element.items(): #if attrib=='id' or attrib=='code': #idField = value # Should be USFM bookcode (not like bookReferenceCode which is BibleOrgSys BBB bookcode) 
##if idField != bookReferenceCode: ## logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) ) #elif attrib=='style': #bookStyle = value #else: #logging.warning( _("gfw2 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) else: logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) ) #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here if Globals.verbosityLevel > 2: print( "USFXXMLBible.load: Didn't find any regularly named USFX files in '{}'".format( self.sourceFolder ) ) for thisFilename in foundFiles: # Look for BBB in the ID line (which should be the first line in a USFX file) isUSFX = False thisPath = os.path.join( self.sourceFolder, thisFilename ) with open( thisPath ) as possibleUSXFile: # Automatically closes the file when done for line in possibleUSXFile: if line.startswith( '\\id ' ): USXId = line[4:].strip()[:3] # Take the first three non-blank characters after the space after id if Globals.verbosityLevel > 2: print( "Have possible USFX ID '{}'".format( USXId ) ) BBB = Globals.BibleBooksCodes.getBBBFromUSFM( USXId ) if Globals.verbosityLevel > 2: print( "BBB is '{}'".format( BBB ) ) isUSFX = True break # We only look at the first line if isUSFX: UBB = USFXXMLBibleBook( self.name, BBB ) UBB.load( self.sourceFolder, thisFilename, self.encoding ) UBB.validateMarkers() print( UBB ) self.books[BBB] = UBB # Make up our book name dictionaries while we're at it assumedBookNames = UBB.getAssumedBookNames() for assumedBookName in assumedBookNames: self.BBBToNameDict[BBB] = assumedBookName assumedBookNameLower = assumedBookName.lower() self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case) self.combinedBookNameDict[assumedBookNameLower] = BBB 
# Store the deduced book name (just lower case) if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces) if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) ) self.doPostLoadProcessing() # end of USFXXMLBible.load def loadBook( self, bookElement ): """ Load the book container from the XML data file. """ if Globals.verbosityLevel > 3: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) ) assert( bookElement.tag == 'book' ) mainLocation = self.name + " USFX book" # Process the attributes first bookCode = None for attrib,value in bookElement.items(): if attrib == 'id': bookCode = value else: logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) ) BBB = Globals.BibleBooksCodes.getBBBFromUSFM( bookCode ) mainLocation = "{} USFX {} book".format( self.name, BBB ) if Globals.verbosityLevel > 2: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) ) Globals.checkXMLNoText( self.tree, mainLocation, '4f6h' ) Globals.checkXMLNoTail( self.tree, mainLocation, '1wk8' ) # Now create our actual book self.thisBook = BibleBook( self.name, BBB ) self.thisBook.objectNameString = "USFX XML Bible Book object" self.thisBook.objectTypeString = "USFX" C = V = '0' for element in bookElement: #print( "element", repr(element.tag) ) location = "{} of {} {}:{}".format( element.tag, mainLocation, C, V ) if element.tag == 'id': idText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'vsg3' ) Globals.checkXMLNoSubelements( element, location, 'ksq2' ) for attrib,value in element.items(): if attrib == 'id': assert( value == bookCode ) else: logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'id', bookCode + ((' '+idText) if idText else '') 
) elif element.tag == 'ide': ideText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'jsa0' ) Globals.checkXMLNoSubelements( element, location, 'ls01' ) charset = None for attrib,value in element.items(): if attrib == 'charset': charset = value else: logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'ide', charset + ((' '+ideText) if ideText else '') ) elif element.tag == 'h': hText = element.text Globals.checkXMLNoTail( element, location, 'dj35' ) Globals.checkXMLNoAttributes( element, location, 'hs35' ) Globals.checkXMLNoSubelements( element, location, 'hs32' ) self.thisBook.appendLine( 'h', clean(hText) ) elif element.tag == 'toc': tocText = element.text Globals.checkXMLNoTail( element, location, 'ss13' ) Globals.checkXMLNoSubelements( element, location, 'js13' ) level = None for attrib,value in element.items(): if attrib == 'level': # Seems compulsory level = value else: logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'toc'+level, clean(tocText) ) elif element.tag == 'c': Globals.checkXMLNoText( element, location, 'ks35' ) Globals.checkXMLNoTail( element, location, 'gs35' ) Globals.checkXMLNoSubelements( element, location, 'kdr3' ) # This is a milestone for attrib,value in element.items(): if attrib == 'id': C, V = value, '0' else: logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'c', C ) elif element.tag == 's': sText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'wxg0' ) level = None for attrib,value in element.items(): if attrib == 'level': # Seems optional level = value else: logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) marker = 's' if level: marker += level self.thisBook.appendLine( marker, sText ) for subelement in element: #print( 
"subelement", repr(subelement.tag) ) sublocation = subelement.tag + " of " + location if subelement.tag == 'f': self.loadFootnote( subelement, sublocation ) elif subelement.tag == 'x': self.loadCrossreference( subelement, sublocation ) elif subelement.tag == 'fig': self.loadFigure( subelement, sublocation ) elif subelement.tag == 'table': self.loadTable( subelement, sublocation ) elif subelement.tag in ('add','it','bd','bdit','sc',): self.loadCharacterFormatting( subelement, sublocation ) elif subelement.tag == 'optionalLineBreak': print( "What is loadBook optionalLineBreak?" ) else: logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) ) elif element.tag in ('p','q','d',): V = self.loadParagraph( element, location, C ) elif element.tag == 'b': Globals.checkXMLNoText( element, location, 'ks35' ) Globals.checkXMLNoTail( element, location, 'gs35' ) Globals.checkXMLNoAttributes( element, location, 'nd04' ) Globals.checkXMLNoSubelements( element, location, 'kdr3' ) self.thisBook.appendLine( 'b', '' ) elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers marker, text = element.tag, clean(element.text) Globals.checkXMLNoTail( element, location, 'od01' ) Globals.checkXMLNoAttributes( element, location, 'us91' ) Globals.checkXMLNoSubelements( element, location, 'gd92' ) self.thisBook.appendLine( marker, text ) elif element.tag == 'table': self.loadTable( element, location ) else: logging.warning( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) ) #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) if Globals.debugFlag: halt self.saveBook( self.thisBook ) # end of USFXXMLBible.loadBook def loadParagraph( self, paragraphElement, paragraphLocation, C ): """ Load the paragraph (p or q) container from the XML data file. 
""" #if Globals.verbosityLevel > 3: #print( _("USFXXMLBible.loadParagraph: Loading {} from {}...").format( self.name, self.sourceFolder ) ) V = None pText = paragraphElement.text Globals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' ) # Process the attributes first sfm = level = style = None for attrib,value in paragraphElement.items(): if attrib == 'sfm': sfm = value elif attrib == 'level': level = value elif attrib == 'style': style = value else: logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) ) for element in paragraphElement: location = element.tag + " of " + paragraphLocation #print( "element", repr(element.tag) ) if element.tag == 'v': # verse milestone vTail = clean( element.tail ) # Main verse text Globals.checkXMLNoText( element, location, 'crc2' ) Globals.checkXMLNoSubelements( element, location, 'lct3' ) lastV, V = V, None for attrib,value in element.items(): if attrib == 'id': V = value else: logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) assert( V is not None ) assert( V ) self.thisBook.appendLine( 'v', V + ((' '+vTail) if vTail else '' ) ) elif element.tag == 've': # verse end milestone -- we can just ignore this Globals.checkXMLNoText( element, location, 'lsc3' ) Globals.checkXMLNoTail( element, location, 'mfy4' ) Globals.checkXMLNoAttributes( element, location, 'bd24' ) Globals.checkXMLNoSubelements( element, location, 'ks35' ) elif element.tag == 'fig': self.loadFigure( element, location ) elif element.tag == 'table': self.loadTable( element, location ) elif element.tag == 'f': #print( "USFX.loadParagraph Found footnote at", paragraphLocation, C, V, repr(element.text) ) self.loadFootnote( element, location ) elif element.tag == 'x': #print( "USFX.loadParagraph Found xref at", paragraphLocation, C, V, repr(element.text) ) self.loadCrossreference( element, location ) elif element.tag in 
('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting self.loadCharacterFormatting( element, location ) elif element.tag == 'cs': # character style -- seems like a USFX hack text, tail = clean(element.text), clean(element.tail) Globals.checkXMLNoSubelements( element, location, 'kf92' ) sfm = None for attrib,value in element.items(): if attrib == 'sfm': sfm = value else: logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if sfm not in ('w','ior',): print( "cs sfm got", repr(sfm) ) self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( sfm, text, sfm, (' '+tail) if tail else '' ) ) elif element.tag in ('cp',): # Simple single-line paragraph-level markers marker, text = element.tag, clean(element.text) Globals.checkXMLNoTail( element, location, 'kdf0' ) Globals.checkXMLNoAttributes( element, location, 'lkj1' ) Globals.checkXMLNoSubelements( element, location, 'da13' ) self.thisBook.appendLine( marker, text ) elif element.tag == 'ref': # encoded reference -- seems like a USFX hack text, tail = clean(element.text), clean(element.tail) Globals.checkXMLNoSubelements( element, location, 'bd83' ) target = None for attrib,value in element.items(): if attrib == 'tgt': target = value else: logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) #if target not in ('w','ior',): print( "ref sfm got", repr(sfm) ) self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) ) #print( "Saved", '\\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) ) elif element.tag == 'optionalLineBreak': print( "What is loadParagraph optionalLineBreak?" ) if Globals.debugFlag: halt elif element.tag == 'milestone': print( "What is loadParagraph milestone?" 
) if Globals.debugFlag: halt else: logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.bookReferenceCode, C, V, location ) ) return V # end of USFXXMLBible.loadParagraph def loadCharacterFormatting( self, element, location ): """ """ marker, text, tail = element.tag, clean(element.text), clean(element.tail) Globals.checkXMLNoAttributes( element, location, 'sd12' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) ) for subelement in element: sublocation = subelement.tag + " of " + location #print( "element", repr(element.tag) ) if subelement.tag == 'f': #print( "USFX.loadParagraph Found footnote at", sublocation, C, V, repr(subelement.text) ) self.loadFootnote( subelement, sublocation ) else: logging.warning( _("sf31 Unprocessed {} element after {} {}:{} in {}").format( repr(subelement.tag), self.thisBook.bookReferenceCode, C, V, location ) ) halt self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, (' '+tail) if tail else '' ) ) # end of USFXXMLBible.loadCharacterFormatting def loadFigure( self, element, location ): """ """ Globals.checkXMLNoText( element, location, 'ff36' ) Globals.checkXMLNoAttributes( element, location, 'cf35' ) figDict = { 'description':'', 'catalog':'', 'size':'', 'location':'', 'copyright':'', 'caption':'', 'reference':'' } for subelement in element: sublocation = subelement.tag + " of " + location figTag, figText = subelement.tag, clean(subelement.text) assert( figTag in figDict ) figDict[figTag] = '' if figText is None else figText Globals.checkXMLNoTail( subelement, sublocation, 'jkf5' ) Globals.checkXMLNoAttributes( subelement, sublocation, 'ld18' ) Globals.checkXMLNoSubelements( subelement, sublocation, 'hb46' ) newString = '' for j,tag in enumerate( ('description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference',) ): newString += ('' if j==0 else '|') + figDict[tag] figTail = clean( element.tail ) self.thisBook.appendToLastLine( ' \\fig 
{}\\fig*{}'.format( newString, (' '+figTail) if figTail else '' ) ) # end of USFXXMLBible.loadFigure def loadTable( self, element, location ): """ """ Globals.checkXMLNoText( element, location, 'kg92' ) Globals.checkXMLNoTail( element, location, 'ka92' ) Globals.checkXMLNoAttributes( element, location, 'ks63' ) for subelement in element: sublocation = subelement.tag + " of " + location if subelement.tag == 'tr': #print( "table", sublocation ) self.thisBook.appendLine( 'tr', '' ) Globals.checkXMLNoText( subelement, sublocation, 'sg32' ) Globals.checkXMLNoTail( subelement, sublocation, 'dh82' ) Globals.checkXMLNoAttributes( subelement, sublocation, 'mniq' ) for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation tag, text = sub2element.tag, clean(sub2element.text) assert( tag in ('th', 'thr', 'tc', 'tcr',) ) Globals.checkXMLNoTail( sub2element, sub2location, 'ah82' ) Globals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' ) level = None for attrib,value in sub2element.items(): if attrib == 'level': level = value else: logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) marker = tag + (level if level else '') self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) ) else: logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, self.thisBook.bookReferenceCode, C, V, sublocation ) ) # end of USFXXMLBible.loadTable def loadFootnote( self, element, location ): """ """ text, tail = clean(element.text), clean(element.tail) caller = None for attrib,value in element.items(): if attrib == 'caller': caller = value else: logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendToLastLine( ' \\f {}{}'.format( caller, (' '+text) if text else '' ) ) for subelement in element: sublocation = subelement.tag + " of " + location marker, fText, fTail = subelement.tag, clean(subelement.text), 
clean(subelement.tail) #print( "USFX.loadFootnote", repr(caller), repr(text), repr(tail), repr(marker), repr(fText), repr(fTail) ) #if Globals.verbosityLevel > 0 and marker not in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',): #print( "USFX.loadFootnote found", repr(caller), repr(marker), repr(fText), repr(fTail) ) if Globals.debugFlag: assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',) ) if marker=='ref': assert( fText ) Globals.checkXMLNoSubelements( subelement, sublocation, 'ls13' ) target = None for attrib,value in subelement.items(): if attrib == 'tgt': target = value else: logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, fText ) ) else: halt else: Globals.checkXMLNoAttributes( subelement, sublocation, 'dq54' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, fText ) ) if marker[0] == 'f': # Starts with f, e.g., fr, ft for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation marker2, fText2, fTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail) Globals.checkXMLNoSubelements( sub2element, sub2location, 'js72' ) if marker2=='ref': print( sub2location ) assert( not fText2 ) target = None for attrib,value in sub2element.items(): if attrib == 'tgt': target = value else: logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) ) else: halt else: halt else: halt if fTail: self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, fTail ) ) self.thisBook.appendToLastLine( '\\f*{}'.format( (' '+tail) if tail else '' ) ) # end of USFXXMLBible.loadFootnote def loadCrossreference( self, element, location ): """ Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x> """ text, tail = 
clean(element.text), clean(element.tail) caller = None for attrib,value in element.items(): if attrib == 'caller': caller = value else: logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) ) for subelement in element: sublocation = subelement.tag + " of " + location marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail) print( "USFX.loadCrossreference", repr(caller), repr(text), repr(tail), repr(marker), repr(xText), repr(xTail) ) #if Globals.verbosityLevel > 0 and marker not in ('ref','xo','xt',): #print( "USFX.loadCrossreference found", repr(caller), repr(marker), repr(xText), repr(xTail) ) if Globals.debugFlag: assert( marker in ('ref','xo','xt',) ) if marker=='ref': assert( xText ) Globals.checkXMLNoSubelements( subelement, sublocation, 's1sd' ) target = None for attrib,value in subelement.items(): if attrib == 'tgt': target = value else: logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) ) else: halt else: Globals.checkXMLNoAttributes( subelement, sublocation, 'sc35' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText ) ) if marker[0] == 'x': # Starts with x, e.g., xo, xt for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail) Globals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' ) if marker2=='ref': assert( not xText2 ) target = None for attrib,value in sub2element.items(): if attrib == 'tgt': target = value else: logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) ) else: halt else: halt else: halt if 
xTail: self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, xTail ) ) self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
def load( self ):
    """
    Load a single tab-separated source file and build one book object per book code.

    Lines beginning with '#' are comment lines. A comment of the form
    '#field<TAB>value' whose field is one of the recognised meta-data names
    stores the value as the attribute of that name on self; any other
    two-part comment with a non-empty value is reported as an unknown field.

    Every other non-blank line is split on tabs and must contain 4, 6 or 9
    fields, the last of which is the verse text:
        4: bookCode, chapter, verse, text
        6: bookCode, chapter, verse, subverse, sequence, text
        9: three NRSVA reference fields followed by the six fields above

    A 'c' line is appended whenever the chapter number changes and a 'v' line
    for each accepted verse. Each completed book is handed to self.saveBook()
    and self.doPostLoadProcessing() is called once at the end.
    """
    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )

    # Meta-data names that are copied straight onto self as attributes
    # Should some of these be placed into self.settingsDict???
    knownMetadataFields = ( 'name', 'filetype', 'copyright', 'abbreviation', 'language', 'note', 'columns', )

    BBB = thisBook = None # thisBook stays None until the first verse line is seen
    # Pre-set so that the diagnostic messages below can be formatted even if
    #   the very first data line is malformed
    bookCode = chapterNumberString = verseNumberString = None
    NRSVA_bookCode = NRSVA_chapterNumberString = NRSVA_verseNumberString = None
    subverseNumberString = sequenceNumberString = None
    lastBookCode = lastChapterNumber = lastVerseNumber = lastSequence = -1
    lastVText = ''
    with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done
        for line in myFile:
            line = line.rstrip( '\n' ) # Remove the trailing newline character
            if not line: continue # Just discard blank lines

            if line[0]=='#':
                hashBits = line[1:].split( '\t' )
                if len(hashBits)==2 and hashBits[1]: # We have some valid meta-data
                    if hashBits[0] in knownMetadataFields:
                        setattr( self, hashBits[0], hashBits[1] )
                    else: # Only complain about fields that we don't recognise
                        logging.warning( "Unknown UnboundBible meta-data field '{}' = '{}'".format( hashBits[0], hashBits[1] ) )
                continue # Just discard comment lines

            bits = line.split( '\t' )
            if len(bits) == 4:
                bookCode, chapterNumberString, verseNumberString, vText = bits
            elif len(bits) == 6:
                bookCode, chapterNumberString, verseNumberString, subverseNumberString, sequenceNumberString, vText = bits
            elif len(bits) == 9:
                NRSVA_bookCode, NRSVA_chapterNumberString, NRSVA_verseNumberString, bookCode, chapterNumberString, verseNumberString, subverseNumberString, sequenceNumberString, vText = bits
            elif len(bits) == 1 and self.givenName.startswith( 'lxx_a_parsing_' ):
                logging.warning( _("Skipping bad '{}' line in {} {} {} {}:{}").format( line, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                continue
            else:
                print( "Unexpected number of bits", self.givenName, BBB, bookCode, chapterNumberString, verseNumberString, len(bits), bits )
                # NOTE(review): 'halt' is not defined in this function -- presumably a
                #   deliberate NameError to stop processing here; confirm before changing
                halt

            if NRSVA_bookCode: assert( len(NRSVA_bookCode) == 3 )
            if NRSVA_chapterNumberString: assert( NRSVA_chapterNumberString.isdigit() )
            if NRSVA_verseNumberString: assert( NRSVA_verseNumberString.isdigit() )

            if not bookCode and not chapterNumberString and not verseNumberString:
                print( "Skipping empty line in {} {} {} {}:{}".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                continue
            if Globals.debugFlag: assert( len(bookCode) == 3 )
            if Globals.debugFlag: assert( chapterNumberString.isdigit() )
            if Globals.debugFlag: assert( verseNumberString.isdigit() )

            if subverseNumberString:
                logging.warning( _("subverseNumberString '{}' in {} {} {}:{}").format( subverseNumberString, BBB, bookCode, chapterNumberString, verseNumberString ) )

            vText = vText.strip() # Remove leading and trailing spaces
            if not vText: continue # Just ignore blank verses I think
            if vText == '+': continue # Not sure what this means in basic_english JHN 1:38

            chapterNumber = int( chapterNumberString )
            verseNumber = int( verseNumberString )
            if sequenceNumberString:
                if Globals.debugFlag: assert( sequenceNumberString.isdigit() )
                sequenceNumber = int( sequenceNumberString )
                if Globals.debugFlag:
                    assert( sequenceNumber > lastSequence or \
                        self.givenName in ('gothic_latin', 'hebrew_bhs_consonants', 'hebrew_bhs_vowels', 'latvian_nt', 'ukrainian_1871',) ) # Why???
                lastSequence = sequenceNumber

            if bookCode != lastBookCode: # We've started a new book
                if lastBookCode != -1: # Better save the last book
                    self.saveBook( thisBook )
                BBB = Globals.BibleBooksCodes.getBBBFromUnboundBibleCode( bookCode )
                thisBook = BibleBook( self, BBB )
                thisBook.objectNameString = "Unbound Bible Book object"
                thisBook.objectTypeString = "Unbound"
                lastBookCode = bookCode
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber: # We've started a new chapter
                if Globals.debugFlag: assert( chapterNumber > lastChapterNumber or BBB=='ESG' ) # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info( "Have chapter zero in {} {} {} {}:{}".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                thisBook.appendLine( 'c', chapterNumberString )
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle the verse info
            if verseNumber==lastVerseNumber and vText==lastVText:
                # Exact repeat of the previous verse line -- this one really is dropped
                logging.warning( _("Ignored duplicate verse line in {} {} {} {}:{}").format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                continue
            if BBB=='PSA' and verseNumberString=='1' and vText.startswith('<') and self.givenName=='basic_english':
                # Move Psalm titles to verse zero
                # (only the number used for the ordering checks changes; the line appended below still uses verseNumberString)
                verseNumber = 0
            # The two checks below only log: the verse line is still appended afterwards
            if verseNumber < lastVerseNumber:
                logging.warning( _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format( lastVerseNumber, verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning( _("Ignored duplicated {} verse in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                else:
                    logging.warning( _("Ignored duplicated {} verse number in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
            thisBook.appendLine( 'v', verseNumberString + ' ' + vText )
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (there isn't one if the file contained no verse lines)
    if thisBook is not None:
        self.saveBook( thisBook )
    self.doPostLoadProcessing()
class USFXXMLBible( Bible ): """ Class to load and manipulate USFX Bibles. """ def __init__( self, sourceFolder, givenName=None, encoding='utf-8' ): """ Create the internal USFX Bible object. """ # Setup and initialise the base class first Bible.__init__( self ) self.objectNameString = "USFX XML Bible object" self.objectTypeString = "USFX" self.sourceFolder, self.givenName, self.encoding = sourceFolder, givenName, encoding # Remember our parameters # Now we can set our object variables self.name = self.givenName if not self.name: self.name = os.path.basename( self.sourceFolder ) if not self.name: self.name = os.path.basename( self.sourceFolder[:-1] ) # Remove the final slash if not self.name: self.name = "USFX Bible" if self.name.endswith( '_usfx' ): self.name = self.name[:-5] # Remove end of name for Haiola projects # Do a preliminary check on the readability of our folder if not os.access( self.sourceFolder, os.R_OK ): logging.error( "USFXXMLBible: Folder '{}' is unreadable".format( self.sourceFolder ) ) # Do a preliminary check on the contents of our folder self.sourceFilename = self.sourceFilepath = None foundFiles, foundFolders = [], [] for something in os.listdir( self.sourceFolder ): somepath = os.path.join( self.sourceFolder, something ) if os.path.isdir( somepath ): foundFolders.append( something ) elif os.path.isfile( somepath ): somethingUpper = something.upper() somethingUpperProper, somethingUpperExt = os.path.splitext( somethingUpper ) ignore = False for ending in filenameEndingsToIgnore: if somethingUpper.endswith( ending): ignore=True; break if ignore: continue if not somethingUpperExt[1:] in extensionsToIgnore: # Compare without the first dot foundFiles.append( something ) else: logging.error( "Not sure what '{}' is in {}!".format( somepath, self.sourceFolder ) ) if foundFolders: logging.info( "USFXXMLBible: Surprised to see subfolders in '{}': {}".format( self.sourceFolder, foundFolders ) ) if not foundFiles: if Globals.verbosityLevel > 0: print( 
"USFXXMLBible: Couldn't find any files in '{}'".format( self.sourceFolder ) ) return # No use continuing #print( self.sourceFolder, foundFolders, len(foundFiles), foundFiles ) numFound = 0 for thisFilename in sorted( foundFiles ): firstLines = Globals.peekIntoFile( thisFilename, sourceFolder, numLines=3 ) if not firstLines or len(firstLines)<2: continue if not firstLines[0].startswith( '<?xml version="1.0"' ) \ and not firstLines[0].startswith( '\ufeff<?xml version="1.0"' ): # same but with BOM if Globals.verbosityLevel > 2: print( "USFXB (unexpected) first line was '{}' in {}".format( firstLines, thisFilename ) ) continue if "<usfx " not in firstLines[0]: continue lastFilenameFound = thisFilename numFound += 1 if numFound: if Globals.verbosityLevel > 2: print( "USFXXMLBible got", numFound, sourceFolder, lastFilenameFound ) if numFound == 1: self.sourceFilename = lastFilenameFound self.sourceFilepath = os.path.join( self.sourceFolder, self.sourceFilename ) elif looksHopeful and Globals.verbosityLevel > 2: print( " Looked hopeful but no actual files found" ) # end of USFXXMLBible.__init_ def load( self ): """ Load the XML data file -- we should already know the filepath. 
""" if Globals.verbosityLevel > 1: print( _("USFXXMLBible: Loading {} from {}...").format( self.name, self.sourceFolder ) ) #if Globals.verbosityLevel > 2: print( _(" It seems we have {}...").format( BBB ) ) #self.thisBook = BibleBook( self, BBB ) #self.thisBook.objectNameString = "OSIS XML Bible Book object" #self.thisBook.objectTypeString = "OSIS" #self.haveBook = True try: self.tree = ElementTree().parse( self.sourceFilepath ) except ParseError: errorString = sys.exc_info()[1] logging.critical( "USFXXMLBible.load: failed loading the xml file {}: '{}'.".format( self.sourceFilepath, errorString ) ) return if Globals.debugFlag: assert( len ( self.tree ) ) # Fail here if we didn't load anything at all # Find the main (osis) container if self.tree.tag == 'usfx': location = "USFX file" Globals.checkXMLNoText( self.tree, location, '4f6h' ) Globals.checkXMLNoTail( self.tree, location, '1wk8' ) # Process the attributes first self.schemaLocation = None for attrib,value in self.tree.items(): #print( "attrib", repr(attrib), repr(value) ) if attrib.endswith("SchemaLocation"): self.schemaLocation = value else: logging.warning( "fv6g Unprocessed {} attribute ({}) in {}".format( attrib, value, location ) ) BBB = C = V = None for element in self.tree: #print( "element", repr(element.tag) ) sublocation = element.tag + " " + location if element.tag == 'languageCode': self.languageCode = element.text Globals.checkXMLNoTail( element, sublocation, 'cff3' ) Globals.checkXMLNoAttributes( element, sublocation, 'des1' ) Globals.checkXMLNoSubelements( element, sublocation, 'dwf2' ) elif element.tag == 'book': self.loadBook( element ) ##Globals.checkXMLNoSubelements( element, sublocation, '54f2' ) #Globals.checkXMLNoTail( element, sublocation, 'hd35' ) ## Process the attributes #idField = bookStyle = None #for attrib,value in element.items(): #if attrib=='id' or attrib=='code': #idField = value # Should be USFM bookcode (not like BBB which is BibleOrgSys BBB bookcode) ##if idField != BBB: 
## logging.warning( _("Unexpected book code ({}) in {}").format( idField, sublocation ) ) #elif attrib=='style': #bookStyle = value #else: #logging.warning( _("gfw2 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) else: logging.warning( _("dbw1 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, sublocation ) ) #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here if Globals.verbosityLevel > 2: print( "USFXXMLBible.load: Didn't find any regularly named USFX files in '{}'".format( self.sourceFolder ) ) for thisFilename in foundFiles: # Look for BBB in the ID line (which should be the first line in a USFX file) isUSFX = False thisPath = os.path.join( self.sourceFolder, thisFilename ) with open( thisPath ) as possibleUSXFile: # Automatically closes the file when done for line in possibleUSXFile: if line.startswith( '\\id ' ): USXId = line[4:].strip()[:3] # Take the first three non-blank characters after the space after id if Globals.verbosityLevel > 2: print( "Have possible USFX ID '{}'".format( USXId ) ) BBB = Globals.BibleBooksCodes.getBBBFromUSFM( USXId ) if Globals.verbosityLevel > 2: print( "BBB is '{}'".format( BBB ) ) isUSFX = True break # We only look at the first line if isUSFX: UBB = USFXXMLBibleBook( self, BBB ) UBB.load( self.sourceFolder, thisFilename, self.encoding ) UBB.validateMarkers() print( UBB ) self.books[BBB] = UBB # Make up our book name dictionaries while we're at it assumedBookNames = UBB.getAssumedBookNames() for assumedBookName in assumedBookNames: self.BBBToNameDict[BBB] = assumedBookName assumedBookNameLower = assumedBookName.lower() self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case) self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just 
lower case) if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces) if self.books: print( "USFXXMLBible.load: Found {} irregularly named USFX files".format( len(self.books) ) ) self.doPostLoadProcessing() # end of USFXXMLBible.load def loadBook( self, bookElement ): """ Load the book container from the XML data file. """ if Globals.verbosityLevel > 3: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( self.name, self.sourceFolder ) ) assert( bookElement.tag == 'book' ) mainLocation = self.name + " USFX book" # Process the attributes first bookCode = None for attrib,value in bookElement.items(): if attrib == 'id': bookCode = value else: logging.warning( "bce3 Unprocessed {} attribute ({}) in {}".format( attrib, value, mainLocation ) ) BBB = Globals.BibleBooksCodes.getBBBFromUSFM( bookCode ) mainLocation = "{} USFX {} book".format( self.name, BBB ) if Globals.verbosityLevel > 2: print( _("USFXXMLBible.loadBook: Loading {} from {}...").format( BBB, self.name ) ) Globals.checkXMLNoText( self.tree, mainLocation, '4f6h' ) Globals.checkXMLNoTail( self.tree, mainLocation, '1wk8' ) # Now create our actual book self.thisBook = BibleBook( self, BBB ) self.thisBook.objectNameString = "USFX XML Bible Book object" self.thisBook.objectTypeString = "USFX" C = V = '0' for element in bookElement: #print( "element", repr(element.tag) ) location = "{} of {} {}:{}".format( element.tag, mainLocation, C, V ) if element.tag == 'id': idText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'vsg3' ) Globals.checkXMLNoSubelements( element, location, 'ksq2' ) for attrib,value in element.items(): if attrib == 'id': assert( value == bookCode ) else: logging.warning( _("vsg4 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'id', bookCode + ((' '+idText) if idText else '') ) elif element.tag == 'ide': ideText = 
clean( element.text ) Globals.checkXMLNoTail( element, location, 'jsa0' ) Globals.checkXMLNoSubelements( element, location, 'ls01' ) charset = None for attrib,value in element.items(): if attrib == 'charset': charset = value else: logging.warning( _("jx53 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'ide', charset + ((' '+ideText) if ideText else '') ) elif element.tag == 'h': hText = element.text Globals.checkXMLNoTail( element, location, 'dj35' ) Globals.checkXMLNoAttributes( element, location, 'hs35' ) Globals.checkXMLNoSubelements( element, location, 'hs32' ) self.thisBook.appendLine( 'h', clean(hText) ) elif element.tag == 'toc': tocText = element.text Globals.checkXMLNoTail( element, location, 'ss13' ) Globals.checkXMLNoSubelements( element, location, 'js13' ) level = None for attrib,value in element.items(): if attrib == 'level': # Seems compulsory level = value else: logging.warning( _("dg36 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'toc'+level, clean(tocText) ) elif element.tag == 'c': Globals.checkXMLNoText( element, location, 'ks35' ) Globals.checkXMLNoTail( element, location, 'gs35' ) Globals.checkXMLNoSubelements( element, location, 'kdr3' ) # This is a milestone for attrib,value in element.items(): if attrib == 'id': C, V = value, '0' else: logging.warning( _("hj52 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendLine( 'c', C ) elif element.tag == 's': sText = clean( element.text ) Globals.checkXMLNoTail( element, location, 'wxg0' ) level = None for attrib,value in element.items(): if attrib == 'level': # Seems optional level = value else: logging.warning( _("bdy6 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) marker = 's' if level: marker += level self.thisBook.appendLine( marker, sText ) for subelement in element: #print( "subelement", repr(subelement.tag) ) 
sublocation = subelement.tag + " of " + location if subelement.tag == 'f': self.loadFootnote( subelement, sublocation ) elif subelement.tag == 'x': self.loadCrossreference( subelement, sublocation ) elif subelement.tag == 'fig': self.loadFigure( subelement, sublocation ) elif subelement.tag == 'table': self.loadTable( subelement, sublocation ) elif subelement.tag in ('add','it','bd','bdit','sc',): self.loadCharacterFormatting( subelement, sublocation ) elif subelement.tag == 'optionalLineBreak': print( "What is loadBook optionalLineBreak?" ) else: logging.warning( _("jx9q Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, BBB, C, V, sublocation ) ) elif element.tag in ('p','q','d',): V = self.loadParagraph( element, location, C ) elif element.tag == 'b': Globals.checkXMLNoText( element, location, 'ks35' ) Globals.checkXMLNoTail( element, location, 'gs35' ) Globals.checkXMLNoAttributes( element, location, 'nd04' ) Globals.checkXMLNoSubelements( element, location, 'kdr3' ) self.thisBook.appendLine( 'b', '' ) elif element.tag in ('cl','cp'): # Simple single-line paragraph-level markers marker, text = element.tag, clean(element.text) Globals.checkXMLNoTail( element, location, 'od01' ) Globals.checkXMLNoAttributes( element, location, 'us91' ) Globals.checkXMLNoSubelements( element, location, 'gd92' ) self.thisBook.appendLine( marker, text ) elif element.tag == 'table': self.loadTable( element, location ) else: logging.critical( _("caf2 Unprocessed {} element after {} {}:{} in {}").format( element.tag, BBB, C, V, location ) ) #self.addPriorityError( 1, c, v, _("Unprocessed {} element").format( element.tag ) ) if Globals.debugFlag: halt self.saveBook( self.thisBook ) # end of USFXXMLBible.loadBook def loadParagraph( self, paragraphElement, paragraphLocation, C ): """ Load the paragraph (p or q) container from the XML data file. 
""" #if Globals.verbosityLevel > 3: #print( _("USFXXMLBible.loadParagraph: Loading {} from {}...").format( self.name, self.sourceFolder ) ) V = None pText = paragraphElement.text Globals.checkXMLNoTail( paragraphElement, paragraphLocation, 'vsg7' ) # Process the attributes first sfm = level = style = None for attrib,value in paragraphElement.items(): if attrib == 'sfm': sfm = value elif attrib == 'level': level = value elif attrib == 'style': style = value else: logging.warning( "vfh4 Unprocessed {} attribute ({}) in {}".format( attrib, value, paragraphLocation ) ) for element in paragraphElement: location = element.tag + " of " + paragraphLocation #print( "element", repr(element.tag) ) if element.tag == 'v': # verse milestone vTail = clean( element.tail ) # Main verse text Globals.checkXMLNoText( element, location, 'crc2' ) Globals.checkXMLNoSubelements( element, location, 'lct3' ) lastV, V = V, None for attrib,value in element.items(): if attrib == 'id': V = value else: logging.warning( _("cbs2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) assert( V is not None ) assert( V ) self.thisBook.appendLine( 'v', V + ((' '+vTail) if vTail else '' ) ) elif element.tag == 've': # verse end milestone -- we can just ignore this Globals.checkXMLNoText( element, location, 'lsc3' ) Globals.checkXMLNoTail( element, location, 'mfy4' ) Globals.checkXMLNoAttributes( element, location, 'bd24' ) Globals.checkXMLNoSubelements( element, location, 'ks35' ) elif element.tag == 'fig': self.loadFigure( element, location ) elif element.tag == 'table': self.loadTable( element, location ) elif element.tag == 'f': #print( "USFX.loadParagraph Found footnote at", paragraphLocation, C, V, repr(element.text) ) self.loadFootnote( element, location ) elif element.tag == 'x': #print( "USFX.loadParagraph Found xref at", paragraphLocation, C, V, repr(element.text) ) self.loadCrossreference( element, location ) elif element.tag in 
('add','nd','wj','rq','sig','sls','bk','k','tl','vp','pn','qs','qt','em','it','bd','bdit','sc','no',): # character formatting self.loadCharacterFormatting( element, location ) elif element.tag == 'cs': # character style -- seems like a USFX hack text, tail = clean(element.text), clean(element.tail) Globals.checkXMLNoSubelements( element, location, 'kf92' ) sfm = None for attrib,value in element.items(): if attrib == 'sfm': sfm = value else: logging.warning( _("sh29 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) if sfm not in ('w','ior',): print( "cs sfm got", repr(sfm) ) self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( sfm, text, sfm, (' '+tail) if tail else '' ) ) elif element.tag in ('cp',): # Simple single-line paragraph-level markers marker, text = element.tag, clean(element.text) Globals.checkXMLNoTail( element, location, 'kdf0' ) Globals.checkXMLNoAttributes( element, location, 'lkj1' ) Globals.checkXMLNoSubelements( element, location, 'da13' ) self.thisBook.appendLine( marker, text ) elif element.tag == 'ref': # encoded reference -- seems like a USFX hack text, tail = clean(element.text), clean(element.tail) Globals.checkXMLNoSubelements( element, location, 'bd83' ) target = None for attrib,value in element.items(): if attrib == 'tgt': target = value else: logging.warning( _("be83 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) #if target not in ('w','ior',): print( "ref sfm got", repr(sfm) ) self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) ) #print( "Saved", '\\{} {}\\{}*{}{}'.format( element.tag, target, element.tag, text, (' '+tail) if tail else '' ) ) elif element.tag == 'optionalLineBreak': print( "What is loadParagraph optionalLineBreak?" ) if Globals.debugFlag: halt elif element.tag == 'milestone': print( "What is loadParagraph milestone?" 
) if Globals.debugFlag: halt else: logging.warning( _("df45 Unprocessed {} element after {} {}:{} in {}").format( repr(element.tag), self.thisBook.BBB, C, V, location ) ) return V # end of USFXXMLBible.loadParagraph def loadCharacterFormatting( self, element, location ): """ """ marker, text, tail = element.tag, clean(element.text), clean(element.tail) Globals.checkXMLNoAttributes( element, location, 'sd12' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) ) for subelement in element: sublocation = subelement.tag + " of " + location #print( "element", repr(element.tag) ) if subelement.tag == 'f': #print( "USFX.loadParagraph Found footnote at", sublocation, C, V, repr(subelement.text) ) self.loadFootnote( subelement, sublocation ) else: logging.warning( _("sf31 Unprocessed {} element after {} {}:{} in {}").format( repr(subelement.tag), self.thisBook.BBB, C, V, location ) ) halt self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, (' '+tail) if tail else '' ) ) # end of USFXXMLBible.loadCharacterFormatting def loadFigure( self, element, location ): """ """ Globals.checkXMLNoText( element, location, 'ff36' ) Globals.checkXMLNoAttributes( element, location, 'cf35' ) figDict = { 'description':'', 'catalog':'', 'size':'', 'location':'', 'copyright':'', 'caption':'', 'reference':'' } for subelement in element: sublocation = subelement.tag + " of " + location figTag, figText = subelement.tag, clean(subelement.text) assert( figTag in figDict ) figDict[figTag] = '' if figText is None else figText Globals.checkXMLNoTail( subelement, sublocation, 'jkf5' ) Globals.checkXMLNoAttributes( subelement, sublocation, 'ld18' ) Globals.checkXMLNoSubelements( subelement, sublocation, 'hb46' ) newString = '' for j,tag in enumerate( ('description', 'catalog', 'size', 'location', 'copyright', 'caption', 'reference',) ): newString += ('' if j==0 else '|') + figDict[tag] figTail = clean( element.tail ) self.thisBook.appendToLastLine( ' \\fig {}\\fig*{}'.format( 
newString, (' '+figTail) if figTail else '' ) ) # end of USFXXMLBible.loadFigure def loadTable( self, element, location ): """ """ Globals.checkXMLNoText( element, location, 'kg92' ) Globals.checkXMLNoTail( element, location, 'ka92' ) Globals.checkXMLNoAttributes( element, location, 'ks63' ) for subelement in element: sublocation = subelement.tag + " of " + location if subelement.tag == 'tr': #print( "table", sublocation ) self.thisBook.appendLine( 'tr', '' ) Globals.checkXMLNoText( subelement, sublocation, 'sg32' ) Globals.checkXMLNoTail( subelement, sublocation, 'dh82' ) Globals.checkXMLNoAttributes( subelement, sublocation, 'mniq' ) for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation tag, text = sub2element.tag, clean(sub2element.text) assert( tag in ('th', 'thr', 'tc', 'tcr',) ) Globals.checkXMLNoTail( sub2element, sub2location, 'ah82' ) Globals.checkXMLNoSubelements( sub2element, sub2location, 'ka63' ) level = None for attrib,value in sub2element.items(): if attrib == 'level': level = value else: logging.warning( _("vx25 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) marker = tag + (level if level else '') self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, text ) ) else: logging.warning( _("kv64 Unprocessed {} element after {} {}:{} in {}").format( subelement.tag, self.thisBook.BBB, C, V, sublocation ) ) # end of USFXXMLBible.loadTable def loadFootnote( self, element, location ): """ """ text, tail = clean(element.text), clean(element.tail) caller = None for attrib,value in element.items(): if attrib == 'caller': caller = value else: logging.warning( _("dg35 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendToLastLine( ' \\f {}{}'.format( caller, (' '+text) if text else '' ) ) for subelement in element: sublocation = subelement.tag + " of " + location marker, fText, fTail = subelement.tag, clean(subelement.text), clean(subelement.tail) #print( 
"USFX.loadFootnote", repr(caller), repr(text), repr(tail), repr(marker), repr(fText), repr(fTail) ) #if Globals.verbosityLevel > 0 and marker not in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',): #print( "USFX.loadFootnote found", repr(caller), repr(marker), repr(fText), repr(fTail) ) if Globals.debugFlag: assert( marker in ('ref','fr','ft','fq','fv','fk','fqa','it','bd','rq',) ) if marker=='ref': assert( fText ) Globals.checkXMLNoSubelements( subelement, sublocation, 'ls13' ) target = None for attrib,value in subelement.items(): if attrib == 'tgt': target = value else: logging.warning( _("gs35 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, fText ) ) else: halt else: Globals.checkXMLNoAttributes( subelement, sublocation, 'dq54' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, fText ) ) if marker[0] == 'f': # Starts with f, e.g., fr, ft for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation marker2, fText2, fTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail) Globals.checkXMLNoSubelements( sub2element, sub2location, 'js72' ) if marker2=='ref': print( sub2location ) assert( not fText2 ) target = None for attrib,value in sub2element.items(): if attrib == 'tgt': target = value else: logging.warning( _("hd52 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) ) else: halt else: halt else: halt if fTail: self.thisBook.appendToLastLine( '\\{}*{}'.format( marker, fTail ) ) self.thisBook.appendToLastLine( '\\f*{}'.format( (' '+tail) if tail else '' ) ) # end of USFXXMLBible.loadFootnote def loadCrossreference( self, element, location ): """ Has to handle: <x caller="+"><ref tgt="EXO.30.12">Exodus 30:12</ref></x> """ text, tail = clean(element.text), clean(element.tail) caller = 
None for attrib,value in element.items(): if attrib == 'caller': caller = value else: logging.warning( _("fhj2 Unprocessed {} attribute ({}) in {}").format( attrib, value, location ) ) self.thisBook.appendToLastLine( ' \\x {}'.format( caller ) ) for subelement in element: sublocation = subelement.tag + " of " + location marker, xText, xTail = subelement.tag, clean(subelement.text), clean(subelement.tail) #print( "USFX.loadCrossreference", repr(caller), repr(text), repr(tail), repr(marker), repr(xText), repr(xTail) ) #if Globals.verbosityLevel > 0 and marker not in ('ref','xo','xt',): #print( "USFX.loadCrossreference found", repr(caller), repr(marker), repr(xText), repr(xTail) ) if Globals.debugFlag: assert( marker in ('ref','xo','xt',) ) if marker=='ref': assert( xText ) Globals.checkXMLNoSubelements( subelement, sublocation, 's1sd' ) target = None for attrib,value in subelement.items(): if attrib == 'tgt': target = value else: logging.warning( _("aj41 Unprocessed {} attribute ({}) in {}").format( attrib, value, sublocation ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}\\{}*{}'.format( marker, target, marker, xText ) ) else: halt else: Globals.checkXMLNoAttributes( subelement, sublocation, 'sc35' ) self.thisBook.appendToLastLine( ' \\{} {}'.format( marker, xText ) ) if marker[0] == 'x': # Starts with x, e.g., xo, xt for sub2element in subelement: sub2location = sub2element.tag + " of " + sublocation marker2, xText2, xTail2 = sub2element.tag, clean(sub2element.text), clean(sub2element.tail) Globals.checkXMLNoSubelements( sub2element, sub2location, 'fs63' ) if marker2=='ref': assert( not xText2 ) target = None for attrib,value in sub2element.items(): if attrib == 'tgt': target = value else: logging.warning( _("gs34 Unprocessed {} attribute ({}) in {}").format( attrib, value, sub2location ) ) if target: self.thisBook.appendToLastLine( ' \\{} {}'.format( marker2, target ) ) else: halt else: halt else: halt if xTail: self.thisBook.appendToLastLine( 
'\\{}*{}'.format( marker, xTail ) ) self.thisBook.appendToLastLine( '\\x*{}'.format( (' '+tail) if tail else '' ) )
def load( self ):
    """
    Load a single source file and load book elements.

    Each non-blank line that doesn't start with '#' is expected to look like
        bookCode chapter:verse verseText
    (the line is split on its first two spaces).

    A new BibleBook is started whenever the book code changes, a 'c' line is
    appended whenever the chapter number changes, and each verse is appended
    as a 'v' line. Finished books are passed to self.saveBook(), and
    self.doPostLoadProcessing() is called at the end.

    Lines that don't have the expected form are logged and skipped.
    """
    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )

    # Book codes that are mapped explicitly -- any other code is looked up in BibleBooksCodes
    specialBookCodes = { 'Ge':'GEN', 'Le':'LEV', 'Jud':'JDG', 'Es':'EST',
                         'Pr':'PRO', 'So':'SNG', 'La':'LAM', 'Jude':'JDE', }

    lineCount = 0
    BBB = thisBook = None
    lastBookCode = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1]=='\n': line = line[:-1] # Removing trailing newline character
            if not line: continue # Just discard blank lines
            if line[0]=='#': continue # Just discard comment lines

            bits = line.split( ' ', 2 )
            if len(bits) != 3 or bits[1].count( ':' ) != 1:
                # Skip the line rather than carrying on with the fields of the previous line
                #   (which don't even exist yet if this is the first line of the file)
                logging.error( "Unexpected number of bits in {} line {}: {} {}".format( self.givenName, lineCount, len(bits), bits ) )
                continue
            bookCode, CVString, vText = bits
            chapterNumberString, verseNumberString = CVString.split( ':' )

            if not bookCode and not chapterNumberString and not verseNumberString:
                print( "Skipping empty line in {} {} {} {}:{}".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                continue
            if Globals.debugFlag: assert( 2 <= len(bookCode) <= 4 )
            if Globals.debugFlag: assert( chapterNumberString.isdigit() )
            if Globals.debugFlag: assert( verseNumberString.isdigit() )
            chapterNumber = int( chapterNumberString )
            verseNumber = int( verseNumberString )

            if bookCode != lastBookCode: # We've started a new book
                if lastBookCode != -1: # Better save the last book
                    self.saveBook( thisBook )
                if bookCode in specialBookCodes: BBB = specialBookCodes[bookCode]
                else: BBB = Globals.BibleBooksCodes.getBBB( bookCode ) # Try to guess
                assert( BBB )
                thisBook = BibleBook( self, BBB )
                thisBook.objectNameString = "VPL Bible Book object"
                thisBook.objectTypeString = "VPL"
                lastBookCode = bookCode
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber: # We've started a new chapter
                if Globals.debugFlag: assert( chapterNumber > lastChapterNumber or BBB=='ESG' ) # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info( "Have chapter zero in {} {} {} {}:{}".format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                thisBook.appendLine( 'c', chapterNumberString )
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle special formatting
            #   [brackets] are for Italicized words
            #   <brackets> are for the Words of Christ in Red
            #   «brackets» are for the Titles in the Book of Psalms.
            vText = vText.replace( '[', '\\add ' ).replace( ']', '\\add*' ) \
                        .replace( '<', '\\wj ' ).replace( '>', '\\wj*' )
            if vText and vText[0]=='«':
                assert( BBB=='PSA' and verseNumberString=='1' )
                vBits = vText[1:].split( '»' )
                thisBook.appendLine( 'd', vBits[0] ) # Psalm title
                vText = vBits[1].lstrip()

            # Handle the verse info
            if verseNumber==lastVerseNumber and vText==lastVText:
                logging.warning( _("Ignored duplicate verse line in {} {} {} {}:{}").format( self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                continue
            # NOTE(review): every '<' was replaced with '\wj ' above, so this startswith('<')
            #   test can't currently succeed -- confirm what was intended here
            if BBB=='PSA' and verseNumberString=='1' and vText.startswith('<') and self.givenName=='basic_english':
                # Move Psalm titles to verse zero
                verseNumber = 0
            # These cases are only reported -- the verse line is still appended below
            if verseNumber < lastVerseNumber:
                logging.warning( _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format( lastVerseNumber, verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning( _("Ignored duplicated {} verse in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
                else:
                    logging.warning( _("Ignored duplicated {} verse number in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookCode, chapterNumberString, verseNumberString ) )
            thisBook.appendLine( 'v', verseNumberString + ' ' + vText )
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (there isn't one if the file contained no verse lines)
    if thisBook is not None:
        self.saveBook( thisBook )
    self.doPostLoadProcessing()
def load(self):
    """
    Load a single source file and load book elements.

    After an optional header line (starting with ``Book,`` or ``"Book",``),
    each non-blank line that doesn't start with '#' is expected to hold four
    comma-separated fields: book number, chapter number, verse number and
    verse text (the line is split on its first three commas only, so the
    text may itself contain commas). If the header line was quoted, a
    matching pair of surrounding quote marks is removed from each field.
    RTF codes in the verse text are then converted to the characters they
    stand for.

    A new BibleBook is started whenever the book number changes, a 'c' line
    is appended whenever the chapter number changes, and each verse is
    appended as a 'v' line. Finished books are passed to self.saveBook(),
    and self.doPostLoadProcessing() is called at the end.

    Lines that don't have four fields are logged and skipped.
    """

    def stripQuotes(field):
        """ Return field without its surrounding pair of matching quote marks (if it has them). """
        if len(field) >= 2 and field[0] == field[-1] and field[0] in '"\'':
            return field[1:-1]
        return field

    if Globals.verbosityLevel > 2:
        print(_("Loading {}...").format(self.sourceFilepath))

    lineCount = 0
    BBB = thisBook = None
    lastBookNumber = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    quoted = None  # Stays None if no header line was recognised
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1] == '\n':
                line = line[:-1]  # Removing trailing newline character
            if not line:
                continue  # Just discard blank lines
            if line == ' ':
                continue  # Handle special case which has blanks on every second line -- HACK
            if line[0] == '#':
                continue  # Just discard comment lines
            if lineCount == 1:
                if line.startswith('"Book",'):
                    quoted = True
                    continue  # Just discard header line
                if line.startswith('Book,'):
                    quoted = False
                    continue  # Just discard header line

            bits = line.split(',', 3)
            if len(bits) != 4:
                # Skip the line rather than carrying on with the fields of the previous line
                #   (which don't even exist yet if this is the first line of the file)
                logging.error("Unexpected number of bits in {} line {}: {} {}".format(
                    self.givenName, lineCount, len(bits), bits))
                continue
            bString, chapterNumberString, verseNumberString, vText = bits

            # Remove quote marks from these strings
            if quoted:
                bString = stripQuotes(bString)
                chapterNumberString = stripQuotes(chapterNumberString)
                verseNumberString = stripQuotes(verseNumberString)
                vText = stripQuotes(vText)

            bookNumber = int(bString)
            chapterNumber = int(chapterNumberString)
            verseNumber = int(verseNumberString)

            if bookNumber != lastBookNumber:  # We've started a new book
                if lastBookNumber != -1:  # Better save the last book
                    self.saveBook(thisBook)
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber(bookNumber)  # Try to guess
                assert (BBB)
                thisBook = BibleBook(self, BBB)
                thisBook.objectNameString = "CSV Bible Book object"
                thisBook.objectTypeString = "CSV"
                lastBookNumber = bookNumber
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if Globals.debugFlag:
                    assert (chapterNumber > lastChapterNumber or BBB == 'ESG')  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info("Have chapter zero in {} {} {} {}:{}".format(
                        self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString))
                thisBook.appendLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Now we have to convert any possible RTF codes to our internal codes
            # First do special characters
            vText = vText.replace('\\ldblquote', '“').replace('\\rdblquote', '”') \
                         .replace('\\lquote', '‘').replace('\\rquote', '’')
            vText = vText.replace('\\emdash', '—').replace('\\endash', '–')
            # Now do Unicode characters
            while True:  # Find patterns like \\'d3
                match = re.search(r"\\'[0-9a-f][0-9a-f]", vText)
                if not match:
                    break
                i = int(vText[match.start() + 2:match.end()], 16)  # Convert two hex characters to decimal
                vText = vText[:match.start()] + chr(i) + vText[match.end():]
            while True:  # Find patterns like \\u253?
                match = re.search(r"\\u[1-2][0-9][0-9]\?", vText)
                if not match:
                    break
                i = int(vText[match.start() + 2:match.end() - 1])  # Convert three digits to decimal
                vText = vText[:match.start()] + chr(i) + vText[match.end():]

            # Handle the verse info
            #   (The messages below report bookNumber -- this loader has no bookCode variable)
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(_("Ignored duplicate verse line in {} {} {} {}:{}").format(
                    self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString))
                continue
            if BBB == 'PSA' and verseNumberString == '1' and vText.startswith('<') \
                    and self.givenName == 'basic_english':
                # Move Psalm titles to verse zero
                verseNumber = 0
            # These cases are only reported -- the verse line is still appended below
            if verseNumber < lastVerseNumber:
                logging.warning(_("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format(
                    lastVerseNumber, verseNumber, self.givenName, BBB, bookNumber,
                    chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning(_("Ignored duplicated {} verse in {} {} {} {}:{}").format(
                        verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString))
                else:
                    logging.warning(_("Ignored duplicated {} verse number in {} {} {} {}:{}").format(
                        verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString))
            thisBook.appendLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (there isn't one if the file contained no verse lines)
    if thisBook is not None:
        self.saveBook(thisBook)
    self.doPostLoadProcessing()
def load( self ):
    """
    Load a single source file and load book elements.

    After an optional header line (starting with 'Book,' or '"Book",'), each
    non-blank line that doesn't start with '#' is expected to hold four
    comma-separated fields: book number, chapter number, verse number and
    verse text (the line is split on its first three commas only, so the text
    may itself contain commas). If the header line was quoted, a matching pair
    of surrounding quote marks is removed from each field. RTF codes in the
    verse text are then converted to the characters they stand for.

    A new BibleBook is started whenever the book number changes, a 'c' line is
    appended whenever the chapter number changes, and each verse is appended
    as a 'v' line. Finished books are passed to self.saveBook(), and
    self.doPostLoadProcessing() is called at the end.

    Lines that don't have four fields are logged and skipped.
    """
    def unquote( field ):
        """ Return field without its surrounding pair of matching quote marks (if it has them). """
        if len(field)>=2 and field[0]==field[-1] and field[0] in '"\'':
            return field[1:-1]
        return field
    # end of unquote

    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )

    lineCount = 0
    BBB = thisBook = None
    lastBookNumber = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    quoted = None # Stays None if no header line was recognised
    with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1]=='\n': line = line[:-1] # Removing trailing newline character
            if not line: continue # Just discard blank lines
            if line==' ': continue # Handle special case which has blanks on every second line -- HACK
            if line[0]=='#': continue # Just discard comment lines
            if lineCount==1:
                if line.startswith( '"Book",' ):
                    quoted = True
                    continue # Just discard header line
                if line.startswith( 'Book,' ):
                    quoted = False
                    continue # Just discard header line

            bits = line.split( ',', 3 )
            if len(bits) != 4:
                # Skip the line rather than carrying on with the fields of the previous line
                #   (which don't even exist yet if this is the first line of the file)
                logging.error( "Unexpected number of bits in {} line {}: {} {}".format( self.givenName, lineCount, len(bits), bits ) )
                continue
            bString, chapterNumberString, verseNumberString, vText = bits

            # Remove quote marks from these strings
            if quoted:
                bString, chapterNumberString = unquote( bString ), unquote( chapterNumberString )
                verseNumberString, vText = unquote( verseNumberString ), unquote( vText )

            bookNumber = int( bString )
            chapterNumber = int( chapterNumberString )
            verseNumber = int( verseNumberString )

            if bookNumber != lastBookNumber: # We've started a new book
                if lastBookNumber != -1: # Better save the last book
                    self.saveBook( thisBook )
                BBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber( bookNumber ) # Try to guess
                assert( BBB )
                thisBook = BibleBook( self, BBB )
                thisBook.objectNameString = "CSV Bible Book object"
                thisBook.objectTypeString = "CSV"
                lastBookNumber = bookNumber
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber: # We've started a new chapter
                if Globals.debugFlag: assert( chapterNumber > lastChapterNumber or BBB=='ESG' ) # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info( "Have chapter zero in {} {} {} {}:{}".format( self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
                thisBook.appendLine( 'c', chapterNumberString )
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Now we have to convert any possible RTF codes to our internal codes
            # First do special characters
            vText = vText.replace( '\\ldblquote', '“' ).replace( '\\rdblquote', '”' ).replace( '\\lquote', '‘' ).replace( '\\rquote', '’' )
            vText = vText.replace( '\\emdash', '—' ).replace( '\\endash', '–' )
            # Now do Unicode characters
            while True: # Find patterns like \\'d3
                match = re.search( r"\\'[0-9a-f][0-9a-f]", vText )
                if not match: break
                i = int( vText[match.start()+2:match.end()], 16 ) # Convert two hex characters to decimal
                vText = vText[:match.start()] + chr( i ) + vText[match.end():]
            while True: # Find patterns like \\u253?
                match = re.search( r"\\u[1-2][0-9][0-9]\?", vText )
                if not match: break
                i = int( vText[match.start()+2:match.end()-1] ) # Convert three digits to decimal
                vText = vText[:match.start()] + chr( i ) + vText[match.end():]

            # Handle the verse info
            #   (The messages below report bookNumber -- this loader has no bookCode variable)
            if verseNumber==lastVerseNumber and vText==lastVText:
                logging.warning( _("Ignored duplicate verse line in {} {} {} {}:{}").format( self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
                continue
            if BBB=='PSA' and verseNumberString=='1' and vText.startswith('<') and self.givenName=='basic_english':
                # Move Psalm titles to verse zero
                verseNumber = 0
            # These cases are only reported -- the verse line is still appended below
            if verseNumber < lastVerseNumber:
                logging.warning( _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format( lastVerseNumber, verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning( _("Ignored duplicated {} verse in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
                else:
                    logging.warning( _("Ignored duplicated {} verse number in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
            thisBook.appendLine( 'v', verseNumberString + ' ' + vText )
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (there isn't one if the file contained no verse lines)
    if thisBook is not None:
        self.saveBook( thisBook )
    self.doPostLoadProcessing()
def load( self ):
    """
    Load a single source file and load book elements.

    Opens self.sourceFilepath as an SQLite database, copies the first row of
    its Details table into self.settingsDict, then requests every verse
    expected by the GENERIC-KJV-66-ENG organizational system from the Bible
    table (one query per verse), passing each non-blank text line to
    handleLine(). Books that yielded at least one line are passed to
    self.saveBook(), and self.doPostLoadProcessing() is called at the end.

    Missing and blank verses are logged and skipped. Loading stops early if
    a verse isn't a string and the Details row has an 'encryption' field.

    The cursor and the database connection are always closed again, even if
    loading fails part-way through.
    """
    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )
    fileExtensionUpper = self.fileExtension.upper()
    if fileExtensionUpper not in filenameEndingsToAccept:
        logging.critical( "{} doesn't appear to be a MySword file".format( self.sourceFilename ) )
    elif not self.sourceFilename.upper().endswith( BibleFilenameEndingsToAccept[0] ):
        logging.critical( "{} doesn't appear to be a MySword Bible file".format( self.sourceFilename ) )

    connection = sqlite3.connect( self.sourceFilepath )
    connection.row_factory = sqlite3.Row # Enable row names
    cursor = connection.cursor()
    try:
        # First get the settings
        cursor.execute( 'select * from Details' )
        row = cursor.fetchone()
        for key in row.keys():
            self.settingsDict[key] = row[key]
        if 'Description' in self.settingsDict and len(self.settingsDict['Description'])<40: self.name = self.settingsDict['Description']
        if 'Abbreviation' in self.settingsDict: self.abbreviation = self.settingsDict['Abbreviation']
        if 'encryption' in self.settingsDict: logging.critical( "{} is encrypted: level {}".format( self.sourceFilename, self.settingsDict['encryption'] ) )

        # Work out which book to start with and how many books to attempt
        # NOTE(review): if neither the OT nor the NT setting is set, BBB and booksExpected
        #   are never assigned and the code below fails -- confirm whether that can happen
        if self.settingsDict['OT'] and self.settingsDict['NT']:
            BBB, booksExpected = 'GEN', 66
        elif self.settingsDict['OT']:
            BBB, booksExpected = 'GEN', 39
        elif self.settingsDict['NT']:
            BBB, booksExpected = 'MAT', 27

        BOS = BibleOrganizationalSystem( "GENERIC-KJV-66-ENG" )

        # Create the first book
        thisBook = BibleBook( self.name, BBB )
        thisBook.objectNameString = "MySword Bible Book object"
        thisBook.objectTypeString = "MySword"

        verseList = BOS.getNumVersesList( BBB )
        numC, numV = len(verseList), verseList[0]
        nBBB = Globals.BibleBooksCodes.getReferenceNumber( BBB )
        C = V = 1

        bookCount = 0
        ourGlobals = { 'haveParagraph':False } # State shared with handleLine()
        haveLines = False
        while True:
            cursor.execute('select Scripture from Bible where Book=? and Chapter=? and Verse=?', (nBBB,C,V) )
            try:
                row = cursor.fetchone()
                line = row[0]
            except (TypeError, sqlite3.Error): # This reference is missing (fetchone() gave None) or couldn't be read
                line = None

            if line is None: logging.warning( "MySwordBible.load: Found missing verse line at {} {}:{}".format( BBB, C, V ) )
            elif not isinstance( line, str ):
                if 'encryption' in self.settingsDict:
                    logging.critical( "MySwordBible.load: Unable to decrypt verse line at {} {}:{} {}".format( BBB, C, V, repr(line) ) )
                    break
                else:
                    logging.critical( "MySwordBible.load: Unable to decode verse line at {} {}:{} {} {}".format( BBB, C, V, repr(line), self.settingsDict ) )
            elif not line: logging.warning( "MySwordBible.load: Found blank verse line at {} {}:{}".format( BBB, C, V ) )
            else:
                haveLines = True

                # Some modules end lines with \r\n or have it in the middle!
                #   (Trailing ones are dropped and any others are changed to spaces)
                if '\r' in line or '\n' in line:
                    logging.warning( "MySwordBible.load: Found CR or LF characters in verse line at {} {}:{}".format( BBB, C, V ) )
                line = line.rstrip( '\r\n' )
                line = line.replace( '\r\n', ' ' ).replace( '\r', ' ' ).replace( '\n', ' ' )

                handleLine( self.name, BBB, C, V, line, thisBook, ourGlobals )

            # Step on to the next verse, chapter, or book
            V += 1
            if V > numV:
                C += 1
                if C > numC: # Save this book now
                    if haveLines:
                        if Globals.verbosityLevel > 3: print( "Saving", BBB, bookCount+1 )
                        self.saveBook( thisBook )
                    bookCount += 1 # Not the number saved but the number we attempted to process
                    if bookCount >= booksExpected: break
                    BBB = BOS.getNextBookCode( BBB )
                    # Create the next book
                    thisBook = BibleBook( self.name, BBB )
                    thisBook.objectNameString = "MySword Bible Book object"
                    thisBook.objectTypeString = "MySword"
                    haveLines = False

                    verseList = BOS.getNumVersesList( BBB )
                    numC, numV = len(verseList), verseList[0]
                    nBBB = Globals.BibleBooksCodes.getReferenceNumber( BBB )
                    C = V = 1
                else: # next chapter only
                    numV = verseList[C-1]
                    V = 1

            if ourGlobals['haveParagraph']:
                thisBook.appendLine( 'p', '' )
                ourGlobals['haveParagraph'] = False
    finally: # Don't leave the database open, whatever happened above
        cursor.close()
        connection.close()
    self.doPostLoadProcessing()
def load( self ):
    """
    Load a single YET source file and load book elements.

    Every non-blank line of the file is a tab-separated record whose first
    field names the record type: 'info', 'book_name', 'verse', 'pericope',
    'parallel', 'xref' or 'footnote'. The whole file is read first; each
    book is then assembled from its verses (with section headings,
    footnotes and cross-references inserted) and handed to self.saveBook().

    Raises ValueError for an unknown record type or for a paragraph marker
    that is not one or two characters long.
    """
    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )

    def decodeVerse( encodedVerseString ):
        """
        Decodes the verse which has @ format codes.

        Paragraph markers come out prefixed with a double backslash and
        character markers with a single backslash, so that the two kinds
        can be told apart later. Footnote and cross-reference callers are
        left as placeholders (backslash + 'ff' or 'xx' + one digit).
        """
        verseString = encodedVerseString
        for _unused in range( 2 ): # Up to two leading '@@' are stripped
            if verseString.startswith( '@@' ): # This simply means that encoding follows
                verseString = verseString[2:]
        # Paragraph markers (marked now with double backslash)
        verseString = verseString.replace( '@^', '\\\\p ' )
        verseString = verseString.replace( '@0', '\\\\m ' )
        verseString = verseString.replace( '@1', '\\\\q1 ' ).replace( '@2', '\\\\q2 ' ).replace( '@3', '\\\\q3 ' ).replace( '@4', '\\\\q4 ' )
        verseString = verseString.replace( '@8', '\\\\m ' )
        # Character markers (marked now with single backslash)
        verseString = verseString.replace( '@6', '\\wj ' ).replace( '@5', '\\wj*' )
        verseString = verseString.replace( '@9', '\\add ' ).replace( '@7', '\\add*' ) # or \\i ???
        verseString = re.sub( r'@<f([0-9])@>@/', r'\\ff\1', verseString )
        verseString = re.sub( r'@<x([0-9])@>@/', r'\\xx\1', verseString )
        assert( '@' not in verseString )
        return verseString
    # end of decodeVerse

    def decodeItalics( encodedString ):
        """ Converts the @9 / @7 codes of headings and notes into italic character markers. """
        return encodedString.replace( '@9', '\\it ' ).replace( '@7', '\\it*' )
    # end of decodeItalics

    def splitParagraphMarker( text ):
        """
        Splits 'p text' or 'q1 text' into its (marker, text) pair.

        The marker is the one or two characters before the first space.
        """
        for markerLength in ( 1, 2 ):
            if text[markerLength:markerLength+1] == ' ':
                return text[:markerLength], text[markerLength+1:]
        raise ValueError( "YETBible: Unable to find a paragraph marker at the start of {!r}".format( text ) )
    # end of splitParagraphMarker

    def checkDigits( *fields ):
        """ In debug mode, asserts that every one of the given strings is numeric. """
        if Globals.debugFlag:
            for field in fields: assert( field.isdigit() )
    # end of checkDigits

    getBBB = Globals.BibleBooksCodes.getBBBFromReferenceNumber

    # Read all the lines into bookDict
    lineCount = 0
    bookNameDict, bookDict, footnoteDict, xrefDict, headingDict = OrderedDict(), OrderedDict(), {}, {}, {}
    BBB = bookNumberString = chapterNumberString = verseNumberString = ''
    lastBBB = bookLines = None
    with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line[-1]=='\n': line = line[:-1] # Removing trailing newline character
            if not line: continue # Just discard blank lines
            bits = line.split( '\t' )
            lineType = bits[0]

            if lineType == 'info':
                assert( len(bits) == 3 )
                fieldName, fieldValue = bits[1], bits[2]
                if fieldName == 'shortName':
                    self.name = fieldValue
                elif fieldName in ( 'longName', 'description' ):
                    pass # Recognised, but not used at present
                elif fieldName == 'locale':
                    assert( 2 <= len(fieldValue) <= 3 ) # Checked, but not used at present
                else:
                    logging.warning( _("YETBible: unknown {} info field in {} {} {}:{}") \
                        .format( repr(fieldName), BBB, bookNumberString, chapterNumberString, verseNumberString ) )

            elif lineType == 'book_name':
                assert( 3 <= len(bits) <= 4 )
                thisBBB = getBBB( bits[1] )
                # The second name is optional
                bookNameDict[thisBBB] = bits[2], ( bits[3] if len(bits) == 4 else '' )

            elif lineType == 'verse':
                assert( len(bits) == 5 )
                bookNumberString, chapterNumberString, verseNumberString, encodedVerseString = bits[1:]
                checkDigits( bookNumberString, chapterNumberString, verseNumberString )
                BBB = getBBB( bookNumberString )
                if BBB != lastBBB: # We have a new book
                    if lastBBB is not None: # We have a completed book to save
                        bookDict[lastBBB] = bookLines
                    assert( BBB in bookNameDict )
                    bookLines = OrderedDict() # Keys are (C,V) strings
                    lastBBB = BBB
                bookLines[(chapterNumberString,verseNumberString)] = decodeVerse( encodedVerseString ) # Just store it for now

            elif lineType == 'pericope':
                assert( len(bits) == 5 )
                bookNumberString, chapterNumberString, verseNumberString, encodedHeadingString = bits[1:]
                checkDigits( bookNumberString, chapterNumberString, verseNumberString )
                BBB = getBBB( bookNumberString )
                headingString = decodeItalics( encodedHeadingString )
                assert( '@' not in headingString )
                headingDict[(BBB,chapterNumberString,verseNumberString)] = headingString, [] # Blank refList

            elif lineType == 'parallel': # These lines optionally follow pericope lines
                assert( len(bits) == 2 )
                # Add to the refList of the most recent reference (the list is updated in place)
                headingDict[(BBB,chapterNumberString,verseNumberString)][1].append( bits[1] )

            elif lineType == 'xref':
                assert( len(bits) == 6 )
                bookNumberString, chapterNumberString, verseNumberString, indexNumberString, encodedNoteString = bits[1:]
                checkDigits( bookNumberString, chapterNumberString, verseNumberString, indexNumberString )
                BBB = getBBB( bookNumberString )
                noteString = decodeItalics( encodedNoteString )
                noteString = re.sub( r'@<ta(.+?)@>', r'', noteString ) # Get rid of these encoded BCV references for now
                noteString = re.sub( r'@<to(.+?)@>', r'', noteString ) # Get rid of these OSIS BCV references for now
                noteString = noteString.replace( '@/', '' )
                assert( '@' not in noteString )
                xrefDict[(BBB,chapterNumberString,verseNumberString,indexNumberString)] = noteString

            elif lineType == 'footnote':
                assert( len(bits) == 6 )
                bookNumberString, chapterNumberString, verseNumberString, indexNumberString, encodedNoteString = bits[1:]
                checkDigits( bookNumberString, chapterNumberString, verseNumberString, indexNumberString )
                BBB = getBBB( bookNumberString )
                noteString = decodeItalics( encodedNoteString )
                assert( '@' not in noteString )
                footnoteDict[(BBB,chapterNumberString,verseNumberString,indexNumberString)] = noteString

            else:
                raise ValueError( "YETBible: Unknown line type {!r} at line {} of {}" \
                                        .format( lineType, lineCount, self.sourceFilepath ) )
    if lastBBB is not None: # (None means that there were no verse lines at all)
        bookDict[lastBBB] = bookLines # Save the last book

    # Now process the books
    for BBB,bkData in bookDict.items():
        thisBook = BibleBook( self.name, BBB )
        thisBook.objectNameString = "YET Bible Book object"
        thisBook.objectTypeString = "YET"
        lastChapterNumberString = None
        for (chapterNumberString,verseNumberString), verseString in bkData.items():
            # Insert headings (can only occur before verses)
            reference = (BBB,chapterNumberString,verseNumberString)
            if reference in headingDict:
                heading, refList = headingDict[reference]
                thisBook.appendLine( 's', heading )
                if refList:
                    thisBook.appendLine( 'r', '(' + '; '.join( refList ) + ')' )

            # Insert footnotes and cross-references
            #   (each placeholder is a backslash, two letters, and a one-digit caller)
            while '\\ff' in verseString:
                fIx = verseString.index( '\\ff' )
                caller = verseString[fIx+3]
                assert( caller.isdigit() )
                note = footnoteDict[(BBB,chapterNumberString,verseNumberString,caller)]
                verseString = verseString[:fIx] + '\\f + \\ft ' + note + '\\f*' + verseString[fIx+4:]
            while '\\xx' in verseString:
                fIx = verseString.index( '\\xx' )
                caller = verseString[fIx+3]
                assert( caller.isdigit() )
                note = xrefDict[(BBB,chapterNumberString,verseNumberString,caller)]
                verseString = verseString[:fIx] + '\\x - \\xt ' + note + '\\x*' + verseString[fIx+4:]

            # Save the Bible data fields
            if chapterNumberString != lastChapterNumberString:
                thisBook.appendLine( 'c', chapterNumberString )
                lastChapterNumberString = chapterNumberString
            if verseString.startswith( '\\\\' ): # It's an initial paragraph marker
                marker, verseString = splitParagraphMarker( verseString[2:] )
                thisBook.appendLine( marker, '' )
            assert( not verseString.startswith( '\\\\' ) )
            verseBits = verseString.split( '\\\\' ) # Split on paragraph markers (but not character markers)
            # Only the text before the first paragraph marker belongs on the verse line
            #   -- what follows each marker gets a line of its own
            thisBook.appendLine( 'v', verseNumberString + ' ' + verseBits[0].rstrip() )
            for bit in verseBits[1:]:
                marker, text = splitParagraphMarker( bit )
                thisBook.appendLine( marker, text.rstrip() )
        self.saveBook( thisBook )
    self.doPostLoadProcessing()
def load( self ):
    """
    Load a single DrupalBible source file and load book elements.

    The first line is expected to be '*Bible'. It is followed by the
    '|'-separated version details, then a '*Chapter' line introducing one
    line of book details per book, then a '*Context' line introducing one
    line per verse. Blank lines and lines starting with '#' are ignored.
    Each completed book is handed to self.saveBook().

    Raises ValueError if a verse line has a non-empty line mark field
    (which this loader does not know how to handle).
    """
    if Globals.verbosityLevel > 2: print( _("Loading {}...").format( self.sourceFilepath ) )

    def getBBB( bookCode ):
        """ Returns the BBB for the given DrupalBible book code. """
        BBBresult = Globals.BibleBooksCodes.getBBBFromDrupalBibleCode( bookCode )
        # Result can be string or list of strings (best guess first)
        return BBBresult if isinstance( BBBresult, str ) else BBBresult[0]
    # end of getBBB

    status = 0 # 0 = getting version details, 1 = getting chapters, 2 = getting verse data
    lineCount = 0
    lastBBB = thisBook = lastChapterNumberString = None
    bookDetails = {} # Collected from the '*Chapter' section (not otherwise used here)
    with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if lineCount==1 and self.encoding.lower()=='utf-8' and line[0]==chr(65279): #U+FEFF
                logging.info( " DrupalBible.load: Detected UTF-8 Byte Order Marker" )
                line = line[1:] # Remove the UTF-8 Byte Order Marker
            if line.endswith( '\n' ): line = line[:-1] # Removing trailing newline character
            if not line: continue # Just discard blank lines
            if line[0] == '#': continue # Just discard comment lines

            if lineCount == 1:
                if line != '*Bible':
                    logging.warning( "Unknown DrupalBible first line: {}".format( repr(line) ) )

            elif status == 0:
                if line == '*Chapter': status = 1
                else: # Get the version name details
                    shortName, fullName, language = line.split( '|' )
                    self.name = fullName

            elif status == 1:
                if line == '*Context': status = 2
                else: # Get the book name details
                    bookCode, bookFullName, bookShortName, numChapters = line.split( '|' )
                    assert( bookShortName == bookCode )
                    bookDetails[getBBB( bookCode )] = bookFullName, bookShortName, numChapters

            elif status == 2: # Get the verse text
                # Only split off the first four fields so that any '|' in the verse text itself is kept
                bookCode, chapterNumberString, verseNumberString, lineMark, verseText = line.split( '|', 4 )
                if lineMark:
                    raise ValueError( "DrupalBible.load: Unexpected line mark {} at line {} of {}" \
                                            .format( repr(lineMark), lineCount, self.sourceFilepath ) )
                BBB = getBBB( bookCode )
                if BBB != lastBBB: # We have started a new book
                    if thisBook is not None: self.saveBook( thisBook )
                    thisBook = BibleBook( self.name, BBB )
                    thisBook.objectNameString = "DrupalBible Bible Book object"
                    thisBook.objectTypeString = "DrupalBible"
                    lastChapterNumberString = None
                    lastBBB = BBB
                if chapterNumberString != lastChapterNumberString:
                    thisBook.appendLine( 'c', chapterNumberString )
                    lastChapterNumberString = chapterNumberString
                verseText = verseText.replace( '<', '\\it ' ).replace( '>', '\\it*' )
                thisBook.appendLine( 'v', verseNumberString + ' ' + verseText )

    # Save the final book (if the file contained any verses at all)
    if thisBook is not None: self.saveBook( thisBook )
    self.doPostLoadProcessing()
class GreekNT( Bible ):
    """
    Class for handling a Greek NT object (which may contain one or more Bible books)

    Note: BBB is used in this class to represent the three-character referenceAbbreviation.
    """

    def __init__( self, sourceFilepath, givenName=None, encoding='utf-8' ):
        """
        Constructor: expects the filepath of the source folder.

        Only records the settings and does a preliminary check of the
        readability of the source -- nothing is loaded until load() is called.
        If the source is not a folder and is unreadable, a critical message
        is logged and the constructor returns early (without setting self.name).
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = "Greek NT Bible object"
        self.objectTypeString = "GreekNT"

        # Now we can set our object variables
        self.sourceFilepath, self.givenName, self.encoding = sourceFilepath, givenName, encoding

        self.title = self.version = self.date = None
        self.tree = self.header = self.frontMatter = self.divs = self.divTypesString = None
        self.lang = self.language = None

        # Do a preliminary check on the readability of our files
        self.possibleFilenames = []
        if os.path.isdir( self.sourceFilepath ): # We've been given a folder -- see if we can find the files
            for filename in os.listdir( self.sourceFilepath ):
                if filename.lower().endswith('.txt'):
                    thisFilepath = os.path.join( self.sourceFilepath, filename )
                    if os.access( thisFilepath, os.R_OK ): # we can read that file
                        self.possibleFilenames.append( filename )
        elif not os.access( self.sourceFilepath, os.R_OK ):
            logging.critical( "GreekNT: File '{}' is unreadable".format( self.sourceFilepath ) )
            return # No use continuing

        self.name = self.givenName
    # end of __init__


    def load( self ):
        """
        Load every book listed in Greek.morphgntBooks from the source folder
        (using the encoding that was given to the constructor).
        """
        if Globals.verbosityLevel > 2: print( "Loading Greek NT from {}...".format( self.sourceFilepath ) )
        for BBB in Greek.morphgntBooks:
            self.loadBook( BBB, Greek.morphgntFilenames[BBB], self.encoding )
        if Globals.verbosityLevel > 3: print( "{} books loaded.".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of load


    def loadBook( self, BBB, filename, encoding='utf-8' ):
        """
        Load the given file (from the source folder) as the book BBB.

        Every line of the file describes one word. Chapter ('c') and verse
        ('v') lines are added whenever the reference changes, and each word
        gives a 'vw' line (the four word forms joined with '/') followed by
        a 'g' line (part-of-speech code and parsing code joined with '/').
        """

        def unpackLine( line ):
            """
            Splits one line into ( (bn,cn,vn), (POSCode,parsingCode), (four word forms) ).

            Should be seven parts in the line
                0 book/chapter/verse
                1 part of speech (POS)
                2 parsing code
                3 text (including punctuation)
                4 word (with punctuation stripped)
                5 normalized word
                6 lemma
            e.g., 180101 N- ----NSM- Παῦλος Παῦλος Παῦλος Παῦλος
                  180102 N- ----DSF- ⸀ἀδελφῇ ἀδελφῇ ἀδελφῇ ἀδελφή
                  180102 P- -------- κατ’ κατ’ κατά κατά
                  180102 N- ----DSF- ἐκκλησίᾳ· ἐκκλησίᾳ ἐκκλησίᾳ ἐκκλησία
            """
            bits = line.split()
            assert( len(bits) == 7 )

            bn, cn, vn = bits[0][0:2], bits[0][2:4], bits[0][4:6]
            if bn[0]=='0': bn = bn[1:] # Remove any leading zero
            if cn[0]=='0': cn = cn[1:] # Remove any leading zero
            if vn[0]=='0': vn = vn[1:] # Remove any leading zero

            POSCode = bits[1]
            assert( len(POSCode) == 2 )
            assert( POSCode in Greek.POSCodes.keys() )

            parsingCode = bits[2]
            assert( len(parsingCode) == 8 )
            for j,char in enumerate(parsingCode):
                assert( char in Greek.parsingCodes[j] )
            assert( parsingCode[0] in Greek.personCodes )
            assert( parsingCode[1] in Greek.tenseCodes )
            assert( parsingCode[2] in Greek.voiceCodes )
            assert( parsingCode[3] in Greek.modeCodes )
            assert( parsingCode[4] in Greek.caseCodes )
            assert( parsingCode[5] in Greek.numberCodes )
            assert( parsingCode[6] in Greek.genderCodes )
            assert( parsingCode[7] in Greek.degreeCodes )

            return (bn,cn,vn,), (POSCode,parsingCode,), (bits[3],bits[4],bits[5],bits[6],)
        # end of unpackLine

        self.thisBook = BibleBook( self.name, BBB )
        self.thisBook.objectNameString = "Morph Greek NT Bible Book object"
        self.thisBook.objectTypeString = "MorphGNT"
        filepath = os.path.join( self.sourceFilepath, filename )
        if Globals.verbosityLevel > 2: print( " Loading {}...".format( filename ) )
        lastC = lastV = None
        with open( filepath, encoding=encoding ) as myFile: # Automatically closes the file when done
            for lineCount,line in enumerate( myFile, start=1 ):
                if lineCount==1 and encoding.lower()=='utf-8' and line and line[0]==chr(65279): #U+FEFF
                    logging.info( "GreekNT: Detected UTF-8 Byte Order Marker in {}".format( filename ) )
                    line = line[1:] # Remove the UTF-8 Byte Order Marker
                if line.endswith( '\n' ): line = line[:-1] # Removing trailing newline character
                (bn, cn, vn), (POSCode, parsingCode), (word1, word2, word3, word4) = unpackLine( line )
                if cn != lastC: # New chapter (so the next verse is always treated as new too)
                    self.thisBook.appendLine( 'c', cn )
                    lastC, lastV = cn, None
                if vn != lastV:
                    self.thisBook.appendLine( 'v', vn )
                    lastV = vn
                self.thisBook.appendLine( 'vw', "{}/{}/{}/{}".format( word1, word2, word3, word4 ) )
                self.thisBook.appendLine( 'g', "{}/{}".format( POSCode, parsingCode ) )
        if self.thisBook:
            if Globals.verbosityLevel > 3: print( " {} words loaded from {}".format( len(self.thisBook), filename ) )
            self.saveBook( self.thisBook )
    # end of loadBook


    @staticmethod
    def _fileEntry( index, key, reference, value ):
        """
        Record in index that value was seen for key at reference.

        index maps each key to a list of ([references],value) entries, with
            one entry per distinct value, in the order first seen.
        """
        entries = index.setdefault( key, [] )
        for refList,existingValue in entries:
            if existingValue == value: # we've already had this combination before
                if reference not in refList: refList.append( reference )
                return
        entries.append( ([reference],value,) )
    # end of _fileEntry


    def xanalyzeWords( self ):
        """
        Go through the NT data and do some filing and sorting of the Greek words.

        Fills self.wordCounts (by BBB, plus 'Total') and four indexes:
            self.actualWordsToNormalized, self.normalizedWordsToActual,
            self.normalizedWordsToParsing and self.lemmasToNormalizedWords.

        Note: iterating self.books[BBB] is expected to give
            reference, parsing, (punctuatedWord,actualWord,normalizedWord,lemma) entries.
        """
        if Globals.verbosityLevel > 3: print( "analyzeWords: have {} books in the loaded NT".format( len(self.books) ) )

        self.wordCounts = {} # Wordcount organized by BBB
        self.wordCounts['Total'] = 0
        self.actualWordsToNormalized, self.normalizedWordsToActual, self.normalizedWordsToParsing, self.lemmasToNormalizedWords = {}, {}, {}, {}
        for BBB in self.books:
            wordCount = len(self.books[BBB])
            self.wordCounts[BBB] = wordCount
            self.wordCounts['Total'] += wordCount
            if Globals.verbosityLevel > 3: print( " analyzeWords: {} has {} Greek words".format( BBB, wordCount ) )
            for reference,parsing,(punctuatedWord,actualWord,normalizedWord,lemma) in self.books[BBB]: # Stuff is: reference,parsing,words
                self._fileEntry( self.actualWordsToNormalized, actualWord, reference, normalizedWord ) # File the actual words
                self._fileEntry( self.normalizedWordsToActual, normalizedWord, reference, actualWord ) # File the normalized words
                self._fileEntry( self.normalizedWordsToParsing, normalizedWord, reference, parsing )
                self._fileEntry( self.lemmasToNormalizedWords, lemma, reference, normalizedWord ) # File the lemmas

        def report( message, index ):
            """ Prints the size of the index, plus its first seven entries if verbose enough. """
            if Globals.verbosityLevel > 2: print( message.format( len(index) ) )
            if Globals.verbosityLevel > 3:
                for j,key in enumerate( index ):
                    print( " ", key, index[key] )
                    if j==6: break
        # end of report

        if Globals.verbosityLevel > 2: print( "analyzeWords: NT has {} Greek words".format( self.wordCounts['Total'] ) )
        report( "analyzeWords: NT has {} actual Greek words", self.actualWordsToNormalized )
        report( "analyzeWords: NT has {} normalized Greek words", self.normalizedWordsToActual )
        report( "analyzeWords: NT has {} normalized Greek words", self.normalizedWordsToParsing )
        report( "analyzeWords: NT has {} Greek self.lemmasToNormalizedWords", self.lemmasToNormalizedWords )
    # end of xanalyzeWords
def load(self):
    """
    Load a single MySword (SQLite) source file and load book elements.

    The 'Details' table supplies the settings (copied into
    self.settingsDict), including the OT/NT flags which decide which books
    are expected. Each verse is then requested in turn from the 'Bible'
    table, following the versification of the GENERIC-KJV-66-ENG
    organizational system, and passed to handleLine(). Books which yielded
    at least one usable verse are handed to self.saveBook().

    Missing, blank or undecodable verses are logged and skipped. If the
    module is flagged as encrypted and a verse isn't text, loading stops
    at that verse. The database connection is always closed again.
    """
    if Globals.verbosityLevel > 2:
        print(_("Loading {}...").format(self.sourceFilepath))

    fileExtensionUpper = self.fileExtension.upper()
    if fileExtensionUpper not in filenameEndingsToAccept:
        logging.critical("{} doesn't appear to be a MySword file".format(
            self.sourceFilename))
    elif not self.sourceFilename.upper().endswith(
            BibleFilenameEndingsToAccept[0]):
        logging.critical(
            "{} doesn't appear to be a MySword Bible file".format(
                self.sourceFilename))

    connection = sqlite3.connect(self.sourceFilepath)
    try:
        connection.row_factory = sqlite3.Row  # Enable row names
        cursor = connection.cursor()

        # First get the settings
        cursor.execute('select * from Details')
        row = cursor.fetchone()
        for key in row.keys():
            self.settingsDict[key] = row[key]
        if 'Description' in self.settingsDict and len(
                self.settingsDict['Description']) < 40:
            self.name = self.settingsDict['Description']
        if 'Abbreviation' in self.settingsDict:
            self.abbreviation = self.settingsDict['Abbreviation']
        if 'encryption' in self.settingsDict:
            logging.critical("{} is encrypted: level {}".format(
                self.sourceFilename, self.settingsDict['encryption']))

        # Decide where to start and how many books to attempt
        if self.settingsDict['OT'] and self.settingsDict['NT']:
            BBB, booksExpected = 'GEN', 66
        elif self.settingsDict['OT']:
            BBB, booksExpected = 'GEN', 39
        elif self.settingsDict['NT']:
            BBB, booksExpected = 'MAT', 27
        else:  # Nothing tells us which books to look for
            logging.critical(
                "{} has neither the OT nor the NT flag set -- nothing loaded".
                format(self.sourceFilename))
            return

        BOS = BibleOrganizationalSystem("GENERIC-KJV-66-ENG")

        # Create the first book
        thisBook = BibleBook(self.name, BBB)
        thisBook.objectNameString = "MySword Bible Book object"
        thisBook.objectTypeString = "MySword"

        verseList = BOS.getNumVersesList(BBB)
        numC, numV = len(verseList), verseList[0]
        nBBB = Globals.BibleBooksCodes.getReferenceNumber(BBB)
        C = V = 1

        bookCount = 0
        ourGlobals = {'haveParagraph': False}
        haveLines = False
        while True:
            cursor.execute(
                'select Scripture from Bible where Book=? and Chapter=? and Verse=?',
                (nBBB, C, V))
            try:
                row = cursor.fetchone()
                line = row[0]
            except (TypeError, sqlite3.Error):
                # This reference is missing (row is None) or could not be fetched
                line = None

            if line is None:
                logging.warning(
                    "MySwordBible.load: Found missing verse line at {} {}:{}".
                    format(BBB, C, V))
            elif not isinstance(line, str):
                if 'encryption' in self.settingsDict:
                    logging.critical(
                        "MySwordBible.load: Unable to decrypt verse line at {} {}:{} {}"
                        .format(BBB, C, V, repr(line)))
                    break  # No use continuing
                logging.critical(
                    "MySwordBible.load: Unable to decode verse line at {} {}:{} {} {}"
                    .format(BBB, C, V, repr(line), self.settingsDict))
            elif not line:
                logging.warning(
                    "MySwordBible.load: Found blank verse line at {} {}:{}"
                    .format(BBB, C, V))
            else:
                haveLines = True

                # Some modules end lines with \r\n or have it in the middle!
                #   (Trailing ones are dropped and any others become spaces)
                if '\r' in line or '\n' in line:
                    logging.warning(
                        "MySwordBible.load: Found CR or LF characters in verse line at {} {}:{}"
                        .format(BBB, C, V))
                    while line and line[-1] in '\r\n':
                        line = line[:-1]
                    line = line.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

                handleLine(self.name, BBB, C, V, line, thisBook, ourGlobals)

            # Advance to the next verse (and chapter and book if necessary)
            V += 1
            if V > numV:
                C += 1
                if C > numC:  # Save this book now
                    if haveLines:
                        if Globals.verbosityLevel > 3:
                            print("Saving", BBB, bookCount + 1)
                        self.saveBook(thisBook)
                    bookCount += 1  # Not the number saved but the number we attempted to process
                    if bookCount >= booksExpected:
                        break
                    BBB = BOS.getNextBookCode(BBB)
                    # Create the next book
                    thisBook = BibleBook(self.name, BBB)
                    thisBook.objectNameString = "MySword Bible Book object"
                    thisBook.objectTypeString = "MySword"
                    haveLines = False

                    verseList = BOS.getNumVersesList(BBB)
                    numC, numV = len(verseList), verseList[0]
                    nBBB = Globals.BibleBooksCodes.getReferenceNumber(BBB)
                    C = V = 1
                else:  # next chapter only
                    numV = verseList[C - 1]
                    V = 1

            # haveParagraph may have been set while handling the line above
            if ourGlobals['haveParagraph']:
                thisBook.appendLine('p', '')
                ourGlobals['haveParagraph'] = False
        cursor.close()
    finally:
        connection.close()  # Don't leave the database file open
    self.doPostLoadProcessing()