def load( self, filename, folder=None, encoding='utf-8' ):
    """
    Load the USFM Bible book from a file.

    Tries to combine physical lines into logical lines,
        i.e., so that all lines begin with a USFM paragraph marker.

    Uses the appendLine function of the base class to save the lines.

    Note: the base class later on will try to break apart lines with a paragraph marker in the middle --
            we don't need to worry about that here.

    Parameters:
        filename: name of the USFM file (joined onto folder if a folder is given)
        folder: optional folder that contains the file
        encoding: text encoding handed on to USFMFile.read()

    Side effects:
        Sets self.sourceFilename, self.sourceFolder and self.sourceFilepath.
        Problems are logged, passed to self.addPriorityError(), and collected
            into self.errorDictionary['Load Errors'] (only set if there were any).
    """

    def doAppendLine( marker, text ):
        """
        Check for newLine markers within the line (if so, break the line)
            and save the information in our database.

        Reads c, v and loadErrors from the enclosing load() call.
        """
        if '\\' in text: # Check markers inside the lines
            markerList = Globals.USFMMarkers.getMarkerListFromText( text )
            ix = 0 # Index into text of the first character that hasn't been saved yet
            for insideMarker, iMIndex, nextSignificantChar, fullMarker, characterContext, endIndex, markerField in markerList: # check paragraph markers
                if insideMarker == '\\': # it's a free-standing backslash (the messages call it a "backspace")
                    loadErrors.append( _("{} {}:{} Improper free-standing backspace character within line in \\{}: '{}'").format( self.bookReferenceCode, c, v, marker, text ) )
                    logging.error( _("Improper free-standing backspace character within line after {} {}:{} in \\{}: '{}'").format( self.bookReferenceCode, c, v, marker, text ) )
                    self.addPriorityError( 100, c, v, _("Improper free-standing backspace character inside a line") )
                elif Globals.USFMMarkers.isNewlineMarker( insideMarker ): # Need to split the line for everything else to work properly
                    if ix == 0: # Only log the first error in the line
                        loadErrors.append( _("{} {}:{} NewLine marker '{}' shouldn't appear within line in \\{}: '{}'").format( self.bookReferenceCode, c, v, insideMarker, marker, text ) )
                        logging.error( _("NewLine marker '{}' shouldn't appear within line after {} {}:{} in \\{}: '{}'").format( insideMarker, self.bookReferenceCode, c, v, marker, text ) )
                        self.addPriorityError( 96, c, v, _("NewLine marker \\{} shouldn't be inside a line").format( insideMarker ) )
                    thisText = text[ix:iMIndex].rstrip()
                    self.appendLine( marker, thisText )
                    ix = iMIndex + 1 + len(insideMarker) + len(nextSignificantChar) # Get the start of the next text -- the 1 is for the backslash
                    marker = insideMarker # setup for the next line
            if ix != 0: # We must have separated multiple lines
                text = text[ix:] # Get the final bit of the line
        self.appendLine( marker, text ) # Call the function in the base class to save the line (or the remainder of the line if we split it above)
    # end of doAppendLine

    def noteInvalidCVField( marker, text ):
        """
        Report a \\c or \\v field whose text is only whitespace (so it has no number to read).
        """
        loadErrors.append( _("{} {}:{} Found {!r} invalid \\{} field").format( self.bookReferenceCode, c, v, text, marker ) )
        logging.error( _("Found {!r} invalid \\{} field after {} {}:{}").format( text, marker, self.bookReferenceCode, c, v ) )
        self.addPriorityError( 100, c, v, _("Found invalid/empty \\{} field in file").format( marker ) )
    # end of noteInvalidCVField

    if Globals.verbosityLevel > 2: print( " " + _("Loading {}...").format( filename ) )
    self.sourceFilename = filename
    self.sourceFolder = folder
    self.sourceFilepath = os.path.join( folder, filename ) if folder else filename
    originalBook = USFMFile()
    originalBook.read( self.sourceFilepath, encoding=encoding )

    # Do some important cleaning up before we save the data
    c = v = '0'
    lastMarker = lastText = ''
    loadErrors = []
    for marker,text in originalBook.lines: # Always process a line behind in case we have to combine lines
        # Keep track of where we are for more helpful error messages
        if marker == 'c' and text:
            fieldBits = text.split()
            if fieldBits: c, v = fieldBits[0], '0'
            else: noteInvalidCVField( marker, text ) # whitespace-only field: text.split()[0] would have raised IndexError
        elif marker == 'v' and text:
            fieldBits = text.split()
            if fieldBits:
                v = fieldBits[0]
                if c == '0': c = '1' # Some single chapter books don't have an explicit chapter 1 marker
            else: noteInvalidCVField( marker, text )
        elif marker == 'restore': continue # Ignore these lines completely

        # Now load the actual Bible book data
        if Globals.USFMMarkers.isNewlineMarker( marker ):
            if lastMarker: doAppendLine( lastMarker, lastText )
            lastMarker, lastText = marker, text
        elif Globals.USFMMarkers.isInternalMarker( marker ) \
        or ( marker.endswith('*') and Globals.USFMMarkers.isInternalMarker( marker[:-1] ) ): # the line begins with an internal marker -- append it to the previous line
            if text:
                loadErrors.append( _("{} {}:{} Found '\\{}' internal marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, marker, text ) )
                logging.warning( _("Found '\\{}' internal marker after {} {}:{} at beginning of line with text: {}").format( marker, self.bookReferenceCode, c, v, text ) )
            else: # no text
                loadErrors.append( _("{} {}:{} Found '\\{}' internal marker at beginning of line (with no text)").format( self.bookReferenceCode, c, v, marker ) )
                logging.warning( _("Found '\\{}' internal marker after {} {}:{} at beginning of line (with no text)").format( marker, self.bookReferenceCode, c, v ) )
            self.addPriorityError( 27, c, v, _("Found \\{} internal marker on new line in file").format( marker ) )
            if not lastText.endswith(' '): lastText += ' ' # Not always good to add a space, but it's their fault!
            lastText += '\\' + marker + ' ' + text
            if Globals.verbosityLevel > 3: print( "{} {} {} Appended {}:'{}' to get combined line {}:'{}'".format( self.bookReferenceCode, c, v, marker, text, lastMarker, lastText ) )
        elif Globals.USFMMarkers.isNoteMarker( marker ) \
        or ( marker.endswith('*') and Globals.USFMMarkers.isNoteMarker( marker[:-1] ) ): # the line begins with a note marker -- append it to the previous line
            if text:
                loadErrors.append( _("{} {}:{} Found '\\{}' note marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, marker, text ) )
                logging.warning( _("Found '\\{}' note marker after {} {}:{} at beginning of line with text: {}").format( marker, self.bookReferenceCode, c, v, text ) )
            else: # no text
                loadErrors.append( _("{} {}:{} Found '\\{}' note marker at beginning of line (with no text)").format( self.bookReferenceCode, c, v, marker ) )
                logging.warning( _("Found '\\{}' note marker after {} {}:{} at beginning of line (with no text)").format( marker, self.bookReferenceCode, c, v ) )
            self.addPriorityError( 26, c, v, _("Found \\{} note marker on new line in file").format( marker ) )
            if not lastText.endswith(' ') and marker!='f': lastText += ' ' # Not always good to add a space, but it's their fault! Don't do it for footnotes, though.
            lastText += '\\' + marker + ' ' + text
            if Globals.verbosityLevel > 3: print( "{} {} {} Appended {}:'{}' to get combined line {}:'{}'".format( self.bookReferenceCode, c, v, marker, text, lastMarker, lastText ) )
        else: # the line begins with an unknown marker
            if text:
                loadErrors.append( _("{} {}:{} Found '\\{}' unknown marker at beginning of line with text: {}").format( self.bookReferenceCode, c, v, marker, text ) )
                logging.error( _("Found '\\{}' unknown marker after {} {}:{} at beginning of line with text: {}").format( marker, self.bookReferenceCode, c, v, text ) )
            else: # no text
                loadErrors.append( _("{} {}:{} Found '\\{}' unknown marker at beginning of line (with no text)").format( self.bookReferenceCode, c, v, marker ) )
                logging.error( _("Found '\\{}' unknown marker after {} {}:{} at beginning of line (with no text)").format( marker, self.bookReferenceCode, c, v ) )
            self.addPriorityError( 100, c, v, _("Found \\{} unknown marker on new line in file").format( marker ) )
            # sortedNLMarkers is defined outside this function
            for tryMarker in sortedNLMarkers: # Try to do something intelligent here -- it might be just a missing space
                if marker.startswith( tryMarker ): # Let's try changing it
                    if lastMarker: doAppendLine( lastMarker, lastText )
                    lastMarker, lastText = tryMarker, marker[len(tryMarker):] + ' ' + text
                    loadErrors.append( _("{} {}:{} Changed '\\{}' unknown marker to '{}' at beginning of line: {}").format( self.bookReferenceCode, c, v, marker, tryMarker, text ) )
                    logging.warning( _("Changed '\\{}' unknown marker to '{}' after {} {}:{} at beginning of line: {}").format( marker, tryMarker, self.bookReferenceCode, c, v, text ) )
                    break
            # Otherwise, don't bother processing this line -- it'll just cause more problems later on
    if lastMarker: doAppendLine( lastMarker, lastText ) # Process the final line

    if not originalBook.lines: # There were no lines!!!
        loadErrors.append( _("{} This USFM file was totally empty: {}").format( self.bookReferenceCode, self.sourceFilename ) )
        logging.error( _("USFM file for {} was totally empty: {}").format( self.bookReferenceCode, self.sourceFilename ) )
        # Save something since we had a file at least.
        # (Previously this only assigned lastMarker/lastText AFTER the final flush above, so nothing was ever saved.)
        self.appendLine( 'rem', 'This (USFM) file was completely empty' )

    if loadErrors: self.errorDictionary['Load Errors'] = loadErrors
def load(self, filename, folder=None, encoding=None):
    """
    Load the USFM Bible book from a file.

    Tries to combine physical lines into logical lines,
        i.e., so that all lines begin with a USFM paragraph marker.

    Uses the addLine function of the base class to save the lines.

    Note: the base class later on will try to break apart lines with a paragraph marker in the middle --
            we don't need to worry about that here.

    Parameters:
        filename: name of the USFM file (joined onto folder if a folder is given)
        folder: optional folder that contains the file
        encoding: text encoding handed on to USFMFile.read() -- None means 'utf-8'

    Side effects:
        Sets self.sourceFilename, self.sourceFolder and self.sourceFilepath.
        Problems are logged, passed to self.addPriorityError(), and collected
            into self.errorDictionary['Load Errors'] (only set if there were any).
    """

    def doaddLine(originalMarker, originalText):
        """
        Check for newLine markers within the line (if so, break the line)
            and save the information in our database.

        Also convert ~ to a proper non-break space.

        Reads C, V and loadErrors from the enclosing load() call.
        """
        # The no-break space is written as an escape so that it can't be silently
        #   turned into an ordinary space by an editor or formatter
        marker, text = originalMarker, originalText.replace('~', '\u00A0')
        if '\\' in text:  # Check markers inside the lines
            markerList = BibleOrgSysGlobals.USFMMarkers.getMarkerListFromText(text)
            ix = 0  # Index into text of the first character that hasn't been saved yet
            for insideMarker, iMIndex, nextSignificantChar, fullMarker, characterContext, endIndex, markerField in markerList:  # check paragraph markers
                if insideMarker == '\\':  # it's a free-standing backslash (the messages call it a "backspace")
                    loadErrors.append(
                        _("{} {}:{} Improper free-standing backspace character within line in \\{}: {!r}")
                        .format(self.BBB, C, V, marker, text))
                    logging.error(
                        _("Improper free-standing backspace character within line after {} {}:{} in \\{}: {!r}")
                        .format(self.BBB, C, V, marker, text))
                    self.addPriorityError(100, C, V, _("Improper free-standing backspace character inside a line"))
                elif BibleOrgSysGlobals.USFMMarkers.isNewlineMarker(insideMarker):  # Need to split the line for everything else to work properly
                    if ix == 0:  # Only log the first error in the line
                        loadErrors.append(
                            _("{} {}:{} NewLine marker {!r} shouldn't appear within line in \\{}: {!r}")
                            .format(self.BBB, C, V, insideMarker, marker, text))
                        logging.error(
                            _("NewLine marker {!r} shouldn't appear within line after {} {}:{} in \\{}: {!r}")
                            .format(insideMarker, self.BBB, C, V, marker, text))
                        self.addPriorityError(96, C, V, _("NewLine marker \\{} shouldn't be inside a line").format(insideMarker))
                    thisText = text[ix:iMIndex].rstrip()
                    self.addLine(marker, thisText)
                    ix = iMIndex + 1 + len(insideMarker) + len(nextSignificantChar)  # Get the start of the next text -- the 1 is for the backslash
                    marker = insideMarker  # setup for the next line
            if ix != 0:  # We must have separated multiple lines
                text = text[ix:]  # Get the final bit of the line
        self.addLine(marker, text)  # Call the function in the base class to save the line (or the remainder of the line if we split it above)
    # end of doaddLine

    # Main code for USFMBibleBook.load()
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print(" " + _("Loading {}…").format(filename))
    self.sourceFilename = filename
    self.sourceFolder = folder
    self.sourceFilepath = os.path.join(folder, filename) if folder else filename
    originalBook = USFMFile()
    if encoding is None:
        encoding = 'utf-8'
    originalBook.read(self.sourceFilepath, encoding=encoding)

    # Do some important cleaning up before we save the data
    C, V = '-1', '-1'  # So first/id line starts at -1:0
    lastMarker = lastText = ''
    loadErrors = []
    for marker, text in originalBook.lines:  # Always process a line behind in case we have to combine lines
        # Keep track of where we are for more helpful error messages
        if marker == 'c' and text:
            chapterBits = text.split()
            if chapterBits:
                C = chapterBits[0]
            else:  # Seems we had a \c field that's just whitespace
                loadErrors.append(
                    _("{} {}:{} Found {!r} invalid chapter field").format(self.BBB, C, V, text))
                logging.critical(
                    _("Found {!r} invalid chapter field after {} {}:{}").format(text, self.BBB, C, V))
                self.addPriorityError(100, C, V, _("Found invalid/empty chapter field in file"))
            V = '0'
        elif marker == 'v' and text:
            verseBits = text.split()
            if verseBits:
                newV = verseBits[0]
                if V == '0' and not (newV == '1' or newV.startswith('1-')):
                    loadErrors.append(
                        _("{} {}:{} Expected v1 after chapter marker not {!r}").format(self.BBB, C, V, newV))
                    logging.error(
                        _("Unexpected {!r} verse number immediately after chapter field after {} {}:{}")
                        .format(newV, self.BBB, C, V))
                    self.addPriorityError(100, C, V, _("Got unexpected chapter number"))
                V = newV
                if C == '-1':
                    C = '1'  # Some single chapter books don't have an explicit chapter 1 marker
            else:  # A \v field that's just whitespace -- text.split()[0] would have raised IndexError
                loadErrors.append(
                    _("{} {}:{} Found {!r} invalid verse field").format(self.BBB, C, V, text))
                logging.critical(
                    _("Found {!r} invalid verse field after {} {}:{}").format(text, self.BBB, C, V))
                self.addPriorityError(100, C, V, _("Found invalid/empty verse field in file"))
        elif C == '-1' and marker != 'intro':
            V = str(int(V) + 1)  # Count the lines before chapter one (V is still numeric while C is '-1')
        elif marker == 'restore':
            # NOTE(review): because of the branch above, this is only reached once C has left '-1',
            #   so a \restore line before the first \c or \v is NOT skipped -- confirm that's intended
            continue  # Ignore these lines completely

        # Now load the actual Bible book data
        if BibleOrgSysGlobals.USFMMarkers.isNewlineMarker(marker):
            if lastMarker:
                doaddLine(lastMarker, lastText)
            lastMarker, lastText = marker, text
        elif BibleOrgSysGlobals.USFMMarkers.isInternalMarker(marker) \
        or (marker.endswith('*') and BibleOrgSysGlobals.USFMMarkers.isInternalMarker(marker[:-1])):  # the line begins with an internal marker -- append it to the previous line
            if text:
                loadErrors.append(
                    _("{} {}:{} Found '\\{}' internal marker at beginning of line with text: {!r}")
                    .format(self.BBB, C, V, marker, text))
                logging.warning(
                    _("Found '\\{}' internal marker after {} {}:{} at beginning of line with text: {!r}")
                    .format(marker, self.BBB, C, V, text))
            else:  # no text
                loadErrors.append(
                    _("{} {}:{} Found '\\{}' internal marker at beginning of line (with no text)")
                    .format(self.BBB, C, V, marker))
                logging.warning(
                    _("Found '\\{}' internal marker after {} {}:{} at beginning of line (with no text)")
                    .format(marker, self.BBB, C, V))
            self.addPriorityError(27, C, V, _("Found \\{} internal marker on new line in file").format(marker))
            if not lastText.endswith(' '):
                lastText += ' '  # Not always good to add a space, but it's their fault!
            lastText += '\\' + marker + ' ' + text
            if BibleOrgSysGlobals.verbosityLevel > 3:
                print("{} {} {} Appended {}:{!r} to get combined line {}:{!r}"
                      .format(self.BBB, C, V, marker, text, lastMarker, lastText))
        elif BibleOrgSysGlobals.USFMMarkers.isNoteMarker(marker) \
        or (marker.endswith('*') and BibleOrgSysGlobals.USFMMarkers.isNoteMarker(marker[:-1])):  # the line begins with a note marker -- append it to the previous line
            if text:
                loadErrors.append(
                    _("{} {}:{} Found '\\{}' note marker at beginning of line with text: {!r}")
                    .format(self.BBB, C, V, marker, text))
                logging.warning(
                    _("Found '\\{}' note marker after {} {}:{} at beginning of line with text: {!r}")
                    .format(marker, self.BBB, C, V, text))
            else:  # no text
                loadErrors.append(
                    _("{} {}:{} Found '\\{}' note marker at beginning of line (with no text)")
                    .format(self.BBB, C, V, marker))
                logging.warning(
                    _("Found '\\{}' note marker after {} {}:{} at beginning of line (with no text)")
                    .format(marker, self.BBB, C, V))
            self.addPriorityError(26, C, V, _("Found \\{} note marker on new line in file").format(marker))
            if not lastText.endswith(' ') and marker != 'f':
                lastText += ' '  # Not always good to add a space, but it's their fault! Don't do it for footnotes, though.
            lastText += '\\' + marker + ' ' + text
            if BibleOrgSysGlobals.verbosityLevel > 3:
                print("{} {} {} Appended {}:{!r} to get combined line {}:{!r}"
                      .format(self.BBB, C, V, marker, text, lastMarker, lastText))
        else:  # the line begins with an unknown marker
            if marker and marker[0] == 'z':  # it's a custom marker
                if text:
                    loadErrors.append(
                        _("{} {}:{} Found '\\{}' unknown custom marker at beginning of line with text: {!r}")
                        .format(self.BBB, C, V, marker, text))
                    logging.warning(
                        _("Found '\\{}' unknown custom marker after {} {}:{} at beginning of line with text: {!r}")
                        .format(marker, self.BBB, C, V, text))
                else:  # no text
                    loadErrors.append(
                        _("{} {}:{} Found '\\{}' unknown custom marker at beginning of line (with no text)")
                        .format(self.BBB, C, V, marker))
                    logging.warning(
                        _("Found '\\{}' unknown custom marker after {} {}:{} at beginning of line (with no text)")
                        .format(marker, self.BBB, C, V))
                self.addPriorityError(80, C, V, _("Found \\{} unknown custom marker on new line in file").format(marker))
            else:  # it's an unknown marker
                if text:
                    loadErrors.append(
                        _("{} {}:{} Found '\\{}' unknown marker at beginning of line with text: {!r}")
                        .format(self.BBB, C, V, marker, text))
                    logging.error(
                        _("Found '\\{}' unknown marker after {} {}:{} at beginning of line with text: {!r}")
                        .format(marker, self.BBB, C, V, text))
                else:  # no text
                    loadErrors.append(
                        _("{} {}:{} Found '\\{}' unknown marker at beginning of line (with no text)")
                        .format(self.BBB, C, V, marker))
                    logging.error(
                        _("Found '\\{}' unknown marker after {} {}:{} at beginning of line (with no text)")
                        .format(marker, self.BBB, C, V))
                self.addPriorityError(100, C, V, _("Found \\{} unknown marker on new line in file").format(marker))
                # sortedNLMarkers is defined outside this function
                for tryMarker in sortedNLMarkers:  # Try to do something intelligent here -- it might be just a missing space
                    if marker.startswith(tryMarker):  # Let's try changing it
                        if lastMarker:
                            doaddLine(lastMarker, lastText)
                        if marker == 's5' and not text:
                            # Door43 projects use empty s5 fields as some kind of division markers
                            lastMarker, lastText = 's', '---'
                        else:
                            # Move the extra appendage to the marker into the actual text
                            lastMarker, lastText = tryMarker, marker[len(tryMarker):] + ' ' + text
                        if text:
                            loadErrors.append(
                                _("{} {}:{} Changed '\\{}' unknown marker to {!r} at beginning of line: {}")
                                .format(self.BBB, C, V, marker, tryMarker, text))
                            logging.warning(
                                _("Changed '\\{}' unknown marker to {!r} after {} {}:{} at beginning of line: {}")
                                .format(marker, tryMarker, self.BBB, C, V, text))
                        else:
                            loadErrors.append(
                                _("{} {}:{} Changed '\\{}' unknown marker to {!r} at beginning of otherwise empty line")
                                .format(self.BBB, C, V, marker, tryMarker))
                            logging.warning(
                                _("Changed '\\{}' unknown marker to {!r} after {} {}:{} at beginning of otherwise empty line")
                                .format(marker, tryMarker, self.BBB, C, V))
                        break
                # Otherwise, don't bother processing this line -- it'll just cause more problems later on
    if lastMarker:
        doaddLine(lastMarker, lastText)  # Process the final line

    if not originalBook.lines:  # There were no lines!!!
        loadErrors.append(
            _("{} This USFM file was totally empty: {}").format(self.BBB, self.sourceFilename))
        logging.error(
            _("USFM file for {} was totally empty: {}").format(self.BBB, self.sourceFilename))
        # Save something since we had a file at least.
        # (Previously this only assigned lastMarker/lastText AFTER the final flush above, so nothing was ever saved.)
        self.addLine('rem', 'This (USFM) file was completely empty')

    if loadErrors:
        self.errorDictionary['Load Errors'] = loadErrors
def _noteSameOrDifferent(resultDict, key):
    """
    File resultDict['File1'][key] and resultDict['File2'][key] under 'Same' (the
        shared value) if they're equal, else under 'Different' (as a 2-tuple).
    """
    value1, value2 = resultDict['File1'][key], resultDict['File2'][key]
    if value1 == value2:
        resultDict['Same'][key] = value1
    else:
        resultDict['Different'][key] = (value1, value2)


def _summariseCountDifference(resultDict, key, noun, file1Name, file2Name):
    """
    Compare the two files' integer counters for key and, if they differ,
        add a 'Summary' sentence saying which file has more (and how many more).
    """
    _noteSameOrDifferent(resultDict, key)
    count1, count2 = resultDict['File1'][key], resultDict['File2'][key]
    if count1 == count2:
        return
    biggerName = file1Name if count1 > count2 else file2Name
    difference = abs(count1 - count2)
    resultDict['Summary'][key] = "{} has {} more {}{}".format(
        biggerName, difference, noun, '' if difference == 1 else 's')


def _tallyUSFMLines(lines, fileKey, fileName, resultDict):
    """
    Walk the (marker, text) pairs of one file, updating its counters in
        resultDict[fileKey] and noting (in 'Summary') the first out-of-order
        chapter, the first out-of-order verse, and the first line that looks
        like a merge conflict.

    fileKey is 'File1' or 'File2' and is also the prefix of the Summary keys.
    """
    counts, summary = resultDict[fileKey], resultDict['Summary']
    startedCVs = False
    lastC = lastV = 0
    C, V = '-1', '-1'  # So first/id line starts at -1:0
    for marker, line in lines:
        if marker == 'c':
            counts['ChapterMarkerCount'] += 1
            C, V, lastV = line.strip(), '0', 0
            try:
                intC = int(C)
            except ValueError:
                intC = -2  # invalid value
            startedCVs = True
            if intC != lastC + 1 and fileKey + 'Chapters' not in summary:  # only record the first one
                summary[fileKey + 'Chapters'] = "{} has chapters out of order ({} after {})".format(fileName, C, lastC)
            lastC = intC
        elif marker == 'v':
            counts['VerseMarkerCount'] += 1
            verseBits = line.split()
            V = verseBits[0] if verseBits else ''  # An empty \v field must not crash the comparison
            if '-' in V:  # it's a verse bridge
                V, V2 = V.split('-', 1)
            else:
                V2 = None
            try:
                intV = int(V)
            except ValueError:
                intV = -1
            startedCVs = True  # Some one chapter books don't include a chapter marker
            if intV != lastV + 1 and fileKey + 'Verses' not in summary:  # only record the first one
                summary[fileKey + 'Verses'] = "{} has verses out of order ({}:{} after {}:{})".format(fileName, C, V, C, lastV)
            try:
                lastV = int(V2) if V2 else intV
            except ValueError:  # Non-numeric end of bridge (e.g., '3a') -- fall back to the start of the bridge
                lastV = intV
        if not startedCVs:
            counts['IntroLineCount'] += 1
        if line.strip():
            counts['HasContentCount'] += 1
        if '<<<<' in line or '====' in line or '>>>>' in line:
            if fileKey + 'Conflicts' not in summary:  # only record the first one
                summary[fileKey + 'Conflicts'] = "{} may have a merge conflict around {}:{}".format(fileName, C, V)


def USFMBookCompare(filepath1, filepath2, file1Name='file1', file2Name='file2'):
    """
    Compare two USFM Bible book files and return the findings.

    Parameters:
        filepath1, filepath2: paths of the two files to compare
        file1Name, file2Name: the names used for the files in the summary sentences

    Returns an OrderedDict with five OrderedDict entries:
        'File1' and 'File2': facts about each file (path, folder, filename, size,
            modification stamp/date/time, and line/intro/chapter/verse/content counts)
        'Same': the facts that are equal in both files
        'Different': the facts that differ, as (file1Value, file2Value) tuples,
            plus the differing marker/line counts (entries with empty/zero values are removed)
        'Summary': human-readable sentences about the most notable differences/problems

    The line-by-line comparison only pairs up lines with the same index
        (there's no attempt yet to resynchronise after inserted or deleted lines).
    """
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print("\nUSFMBookCompare() for USFM Bible books")
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print(" comparing {}".format(filepath1))
        print(" and {}".format(filepath2))

    # Set up empty results dictionaries
    resultDict = OrderedDict()
    for sectionName in ('File1', 'File2', 'Same', 'Different', 'Summary'):
        resultDict[sectionName] = OrderedDict()
    file1Info, file2Info = resultDict['File1'], resultDict['File2']
    same, different, summary = resultDict['Same'], resultDict['Different'], resultDict['Summary']

    # Note paths and folders
    file1Info['Filepath'], file2Info['Filepath'] = filepath1, filepath2
    file1Info['Folder'], file1Info['Filename'] = os.path.split(filepath1)
    file2Info['Folder'], file2Info['Filename'] = os.path.split(filepath2)
    if file2Info['Filename'] == file1Info['Filename']:
        same['Filename'] = file1Info['Filename']
    if file2Info['Folder'] == file1Info['Folder']:
        same['Folder'] = file1Info['Folder']

    # Note file dates and sizes
    s1, s2 = os.stat(filepath1), os.stat(filepath2)
    file1Info['Filesize'], file2Info['Filesize'] = s1.st_size, s2.st_size
    _noteSameOrDifferent(resultDict, 'Filesize')
    if s1.st_size > s2.st_size:
        summary['Filesize'] = "{} is bigger (by {:,} bytes)".format(file1Name, s1.st_size - s2.st_size)
    elif s1.st_size < s2.st_size:
        summary['Filesize'] = "{} is bigger (by {:,} bytes)".format(file2Name, s2.st_size - s1.st_size)
    file1Info['ModifiedTimeStamp'], file2Info['ModifiedTimeStamp'] = s1.st_mtime, s2.st_mtime
    if s1.st_mtime - s2.st_mtime > 1.0:  # Ignore differences of a second or less
        summary['ModifiedTime'] = "{} is newer".format(file1Name)
    elif s2.st_mtime - s1.st_mtime > 1.0:
        summary['ModifiedTime'] = "{} is newer".format(file2Name)
    t1, t2 = datetime.fromtimestamp(s1.st_mtime), datetime.fromtimestamp(s2.st_mtime)
    file1Info['ModifiedDate'], file2Info['ModifiedDate'] = t1.strftime('%Y-%m-%d'), t2.strftime('%Y-%m-%d')
    _noteSameOrDifferent(resultDict, 'ModifiedDate')
    file1Info['ModifiedTime'], file2Info['ModifiedTime'] = t1.strftime('%H:%M:%S'), t2.strftime('%H:%M:%S')
    _noteSameOrDifferent(resultDict, 'ModifiedTime')

    # Read the files
    uf1, uf2 = USFMFile(), USFMFile()
    uf1.read(filepath1)
    uf2.read(filepath2)

    # Note line counts
    file1Info['LineCount'], file2Info['LineCount'] = len(uf1.lines), len(uf2.lines)
    _noteSameOrDifferent(resultDict, 'LineCount')

    # Work through each file counting chapters and verses, etc.
    for counterName in ('IntroLineCount', 'ChapterMarkerCount', 'VerseMarkerCount', 'HasContentCount'):
        file1Info[counterName] = file2Info[counterName] = 0
    _tallyUSFMLines(uf1.lines, 'File1', file1Name, resultDict)
    _tallyUSFMLines(uf2.lines, 'File2', file2Name, resultDict)

    # (The chapter marker case used to call .format() with only one of its three
    #   arguments, raising IndexError whenever file1 had more chapter markers)
    _summariseCountDifference(resultDict, 'IntroLineCount', 'intro marker', file1Name, file2Name)
    _summariseCountDifference(resultDict, 'ChapterMarkerCount', 'chapter marker', file1Name, file2Name)
    _summariseCountDifference(resultDict, 'VerseMarkerCount', 'verse marker', file1Name, file2Name)
    _summariseCountDifference(resultDict, 'HasContentCount', 'content line', file1Name, file2Name)

    # Work through the files again comparing lines
    #   Trying to resync if there's a different number of lines…NOT FINISHED YET XXXXXXXXXXXXXXX
    same['SameMarkerCount'] = different['DifferentMarkerCount'] = 0
    same['SameLineCount'] = different['DifferentLineCount'] = 0
    for (m1, l1), (m2, l2) in zip(uf1.lines, uf2.lines):  # Stops at the end of the shorter file
        if m1 == m2:
            same['SameMarkerCount'] += 1
        else:
            if BibleOrgSysGlobals.debugFlag:
                print("Diff", m1, m2, l1, l2)
            different['DifferentMarkerCount'] += 1
        if m1 == m2 and l1 == l2:
            same['SameLineCount'] += 1
        else:
            if BibleOrgSysGlobals.debugFlag:
                print("Diff", m1, m2, l1, l2)
            different['DifferentLineCount'] += 1
    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
        print("File1 done" if file1Info['LineCount'] <= file2Info['LineCount'] else "File2 done")

    # Clean up and return
    for something, value in list(different.items()):  # Copy to a list because we delete as we go
        if not value:
            del different[something]
    return resultDict
def _recordSameOrDifferent( resultDict, key ):
    """
    Compare resultDict['File1'][key] with resultDict['File2'][key].

    If they are equal, the shared value is stored in resultDict['Same'][key],
        otherwise the (file1Value, file2Value) 2-tuple is stored in
        resultDict['Different'][key].

    Returns True if the two values were equal, else False.
    """
    value1, value2 = resultDict['File1'][key], resultDict['File2'][key]
    if value1 == value2:
        resultDict['Same'][key] = value1
        return True
    resultDict['Different'][key] = (value1, value2)
    return False
# end of _recordSameOrDifferent


def _compareCount( resultDict, key, itemDescription, file1Name, file2Name ):
    """
    Record whether the named count is the same in both files and, if it is not,
        add a sentence to resultDict['Summary'][key] saying which file has more.

    itemDescription is the singular noun phrase used in that sentence
        (e.g., 'chapter marker') -- an 's' is appended unless the difference is 1.
    """
    if _recordSameOrDifferent( resultDict, key ):
        return
    count1, count2 = resultDict['File1'][key], resultDict['File2'][key]
    if count1 > count2: biggerName, difference = file1Name, count1 - count2
    else: biggerName, difference = file2Name, count2 - count1
    resultDict['Summary'][key] = "{} has {} more {}{}".format( biggerName, difference, itemDescription, '' if difference==1 else 's' )
# end of _compareCount


def _tallyUSFMLines( lines, fileKey, fileName, resultDict ):
    """
    Work through the (marker,text) pairs of one file, counting intro lines,
        chapter markers, verse markers, and lines that have content.

    The counts are stored in resultDict[fileKey] (fileKey is 'File1' or 'File2').

    The first out-of-order chapter, the first out-of-order verse, and the first
        line that looks like a merge conflict are each noted in resultDict['Summary']
        (under fileKey+'Chapters', fileKey+'Verses', and fileKey+'Conflicts').
    """
    counts, summary = resultDict[fileKey], resultDict['Summary']
    for countKey in ('IntroLineCount','ChapterMarkerCount','VerseMarkerCount','HasContentCount'):
        counts[countKey] = 0

    startedCVs = False
    lastC = lastV = 0
    C = V = '0'
    for marker,line in lines:
        if marker=='c':
            counts['ChapterMarkerCount'] += 1
            C, V, lastV = line.strip(), '0', 0
            try: intC = int( C )
            except ValueError: intC = -1 # Can't be the expected next chapter number
            startedCVs = True
            if intC != lastC + 1 and fileKey+'Chapters' not in summary: # only record the first one
                summary[fileKey+'Chapters'] = "{} has chapters out of order ({} after {})".format( fileName, C, lastC )
            lastC = intC
        elif marker=='v':
            counts['VerseMarkerCount'] += 1
            verseBits = line.split()
            V = verseBits[0] if verseBits else '' # A verse marker might have no text at all
            if '-' in V: # it's a verse bridge
                V,V2 = V.split( '-', 1 )
            else: V2 = None
            try: intV = int( V )
            except ValueError: intV = -1 # Can't be the expected next verse number
            startedCVs = True # Some one chapter books don't include a chapter marker
            if intV != lastV + 1 and fileKey+'Verses' not in summary: # only record the first one
                summary[fileKey+'Verses'] = "{} has verses out of order ({}:{} after {}:{})".format( fileName, C, V, C, lastV )
            lastV = intV
            if V2: # then the next verse should follow on from the END of the bridge
                try: lastV = int( V2 )
                except ValueError: pass # Bridge end isn't a number (e.g., '6a') -- stay with the bridge start
        if not startedCVs: counts['IntroLineCount'] += 1
        if line.strip(): counts['HasContentCount'] += 1
        if '<<<<' in line or '====' in line or '>>>>' in line:
            if fileKey+'Conflicts' not in summary: # only record the first one
                summary[fileKey+'Conflicts'] = "{} may have a merge conflict around {}:{}".format( fileName, C, V )
# end of _tallyUSFMLines


def USFMBookCompare( filepath1, filepath2, file1Name='file1', file2Name='file2' ):
    """
    Compare two USFM Bible book files and return the findings.

    filepath1 and filepath2 are the paths of the two files to be compared.
    file1Name and file2Name are the labels used for the files in the
        human-readable summary sentences.

    Returns an OrderedDict with five OrderedDict entries:
        'File1', 'File2': the facts gathered about each file (path, folder, filename,
            size, modification stamp/date/time, and the various line and marker counts)
        'Same': the facts which are identical for both files
        'Different': (file1Value, file2Value) 2-tuples for the facts which differ,
            plus the counts of differing markers and lines if not zero
        'Summary': human-readable sentences about the more notable findings.

    The line-by-line comparison simply pairs up the lines by position
        and stops at the end of the shorter file.

    The os.stat() calls will raise an OSError if either file can't be accessed.
    """
    if BibleOrgSysGlobals.verbosityLevel > 2: print( "\nUSFMBookCompare() for USFM Bible books" )
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print( " comparing {}".format( filepath1 ) )
        print( " and {}".format( filepath2 ) )

    # Set up empty results dictionaries
    resultDict = OrderedDict()
    resultDict['File1'], resultDict['File2'] = OrderedDict(), OrderedDict()
    resultDict['Same'], resultDict['Different'], resultDict['Summary'] = OrderedDict(), OrderedDict(), OrderedDict()

    # Note paths and folders
    resultDict['File1']['Filepath'], resultDict['File2']['Filepath'] = filepath1, filepath2
    resultDict['File1']['Folder'], resultDict['File1']['Filename'] = os.path.split( filepath1 )
    resultDict['File2']['Folder'], resultDict['File2']['Filename'] = os.path.split( filepath2 )
    for key in ('Filename','Folder'): # Only matches are noted for these two
        if resultDict['File2'][key] == resultDict['File1'][key]:
            resultDict['Same'][key] = resultDict['File1'][key]

    # Note file dates and sizes
    s1, s2 = os.stat( filepath1 ), os.stat( filepath2 )
    resultDict['File1']['Filesize'], resultDict['File2']['Filesize'] = s1.st_size, s2.st_size
    if not _recordSameOrDifferent( resultDict, 'Filesize' ):
        if s1.st_size > s2.st_size:
            resultDict['Summary']['Filesize'] = "{} is bigger (by {:,} bytes)".format( file1Name, s1.st_size - s2.st_size )
        else:
            resultDict['Summary']['Filesize'] = "{} is bigger (by {:,} bytes)".format( file2Name, s2.st_size - s1.st_size )
    resultDict['File1']['ModifiedTimeStamp'], resultDict['File2']['ModifiedTimeStamp'] = s1.st_mtime, s2.st_mtime
    # Stamps within a second of each other aren't reported as older/newer
    if s1.st_mtime - s2.st_mtime > 1.0: resultDict['Summary']['ModifiedTime'] = "{} is newer".format( file1Name )
    elif s2.st_mtime - s1.st_mtime > 1.0: resultDict['Summary']['ModifiedTime'] = "{} is newer".format( file2Name )
    t1, t2 = datetime.fromtimestamp( s1.st_mtime ), datetime.fromtimestamp( s2.st_mtime )
    resultDict['File1']['ModifiedDate'], resultDict['File2']['ModifiedDate'] = t1.strftime('%Y-%m-%d'), t2.strftime('%Y-%m-%d')
    _recordSameOrDifferent( resultDict, 'ModifiedDate' )
    resultDict['File1']['ModifiedTime'], resultDict['File2']['ModifiedTime'] = t1.strftime('%H:%M:%S'), t2.strftime('%H:%M:%S')
    _recordSameOrDifferent( resultDict, 'ModifiedTime' )

    # Read the files
    uf1, uf2 = USFMFile(), USFMFile()
    uf1.read( filepath1 )
    uf2.read( filepath2 )
    #print( 'f1', uf1.lines )
    #print( 'f2', uf2.lines )

    # Note line counts
    resultDict['File1']['LineCount'], resultDict['File2']['LineCount'] = len(uf1.lines), len(uf2.lines)
    _recordSameOrDifferent( resultDict, 'LineCount' )

    # Work through each file counting chapters and verses, etc.
    _tallyUSFMLines( uf1.lines, 'File1', file1Name, resultDict )
    _tallyUSFMLines( uf2.lines, 'File2', file2Name, resultDict )
    # NOTE: The summary for IntroLineCount has always said 'intro marker' (not 'intro line')
    for key,itemDescription in ( ('IntroLineCount','intro marker'), ('ChapterMarkerCount','chapter marker'),
                                ('VerseMarkerCount','verse marker'), ('HasContentCount','content line') ):
        _compareCount( resultDict, key, itemDescription, file1Name, file2Name )

    # Work through the files again comparing lines
    # TODO: Resyncing when the files have a different number of lines is NOT IMPLEMENTED YET
    #   so lines are simply paired up by position until the shorter file runs out
    resultDict['Same']['SameMarkerCount'] = resultDict['Different']['DifferentMarkerCount'] = 0
    resultDict['Same']['SameLineCount'] = resultDict['Different']['DifferentLineCount'] = 0
    for (m1,l1), (m2,l2) in zip( uf1.lines, uf2.lines ):
        #print( m1, m2 )
        sameMarker = m1==m2
        if sameMarker: resultDict['Same']['SameMarkerCount'] += 1
        else:
            if BibleOrgSysGlobals.debugFlag: print( "Diff", m1, m2, l1, l2 )
            resultDict['Different']['DifferentMarkerCount'] += 1
        if sameMarker and l1==l2: resultDict['Same']['SameLineCount'] += 1
        else:
            if BibleOrgSysGlobals.debugFlag: print( "Diff", m1, m2, l1, l2 )
            resultDict['Different']['DifferentLineCount'] += 1
    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
        # File1 is reported if it ran out first (or if both ran out together)
        print( "File1 done" if resultDict['File1']['LineCount'] <= resultDict['File2']['LineCount'] else "File2 done" )

    # Clean up and return
    #   (Iterate over a copy because entries with zero counts are deleted as we go)
    for something,value in list( resultDict['Different'].items() ):
        if not value: del resultDict['Different'][something]
    return resultDict
# end of USFMBookCompare