def __init__(self, parameterOne, resourcesObject=None, downloadAllBooks=False):
    """
    Create the Door43 cataloged Bible object.

    parameterOne can be:
        a catalog dictionary entry (and second parameter must be None)
    or
        an index into the BibleList in the resourcesObject passed as the second parameter

    If downloadAllBooks is set, the repo's master.zip is downloaded and
    unzipped (unless a local folder newer than the entry's 'updated_at'
    already exists) and USFMBible is initialised from the single folder
    found inside the unzipped folder.
    Otherwise the local folder is only created (if necessary),
    self.attemptedDownload is set to an empty dict, and USFMBible is
    initialised from that folder.

    NOTE: If the download fails with a URLError, the error is logged and
        this method returns early WITHOUT calling USFMBible.__init__.
    """
    # Local import: only the `datetime` class is known to be in scope here
    from datetime import timezone

    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
        print(
            f"DCSBible.__init__( {parameterOne}, {resourcesObject}, {downloadAllBooks} )…"
        )

    if isinstance(parameterOne, dict):
        assert resourcesObject is None
        resourceDict = parameterOne
    else:
        assert isinstance(parameterOne, int)
        assert resourcesObject  # why ??? and isinstance( resourcesObject, Door43CatalogResources )
        resourceDict = resourcesObject.getBibleResourceDict(parameterOne)
    assert resourceDict and isinstance(resourceDict, dict)

    self.baseURL = resourceDict['html_url']
    adjustedRepoName = resourceDict['full_name'].replace('/', '--')
    # NOTE(review): this sanitised name is computed but the folder path below
    #   is built from adjustedRepoName -- confirm which one was intended.
    desiredFolderName = BibleOrgSysGlobals.makeSafeFilename(adjustedRepoName)
    unzippedFolderPath = os.path.join(
        BibleOrgSysGlobals.DOWNLOADED_RESOURCES_FOLDER,
        'Door43ContentServiceOnline/', f"{adjustedRepoName}/")

    if downloadAllBooks:
        # See if files already exist and are current (so don't download again)
        alreadyDownloadedFlag = False
        if os.path.isdir(unzippedFolderPath):
            # 'updated_at' ends in 'Z' (UTC) and parses to a naive datetime,
            #   so the folder time must be expressed as naive UTC as well
            #   (a plain fromtimestamp() would give local time).
            updatedDatetime = datetime.strptime(
                resourceDict['updated_at'], '%Y-%m-%dT%H:%M:%SZ')
            folderModifiedDatetime = datetime.fromtimestamp(
                os.stat(unzippedFolderPath).st_mtime,
                tz=timezone.utc).replace(tzinfo=None)
            alreadyDownloadedFlag = folderModifiedDatetime > updatedDatetime

        if alreadyDownloadedFlag:
            if BibleOrgSysGlobals.verbosityLevel > 1:
                print("Skipping download because folder '{}' already exists."
                      .format(unzippedFolderPath))
        else:
            # Download the zip file (containing all the USFM files, README.md, LICENSE.md, manifest.yaml, etc.)
            # TODO: Change to .tar.gz instead of zip
            zipURL = self.baseURL + '/archive/master.zip'  # '/archive/master.tar.gz'
            if BibleOrgSysGlobals.verbosityLevel > 1:
                print("Downloading entire repo from '{}'…".format(zipURL))
            try:
                HTTPResponseObject = urllib.request.urlopen(zipURL)
            except urllib.error.URLError as err:
                logging.critical("DCS URLError '{}' from {}".format(
                    err, zipURL))
                return

            # Make sure the connection is released even if something below fails
            with HTTPResponseObject:
                contentType = HTTPResponseObject.info().get('content-type')
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                    print(" contentType", repr(contentType))
                if contentType == 'application/octet-stream':
                    os.makedirs(unzippedFolderPath, exist_ok=True)
                    downloadedData = HTTPResponseObject.read()
                    if BibleOrgSysGlobals.verbosityLevel > 0:
                        print(
                            f" Downloaded {len(downloadedData):,} bytes from '{zipURL}'"
                        )
                    # Bug in Python up to 3.7 makes SpooledTemporaryFile not work
                    #   for large aligned Bibles (3+ MB), hence TemporaryFile.
                    # The temp file is deleted automatically when closed.
                    with tempfile.TemporaryFile() as myTempFile:
                        myTempFile.write(downloadedData)
                        with zipfile.ZipFile(myTempFile) as myzip:
                            # NOTE: Could be a security risk here
                            myzip.extractall(unzippedFolderPath)
                    # Re-extracting into an existing folder doesn't change the
                    #   folder's own modification time, so touch it explicitly
                    #   or the freshness check above would never succeed.
                    os.utime(unzippedFolderPath)
                else:
                    print(" contentType", repr(contentType))
                    # NOTE(review): `halt` is not defined in the visible code --
                    #   presumably a deliberate NameError to abort. Confirm.
                    halt  # unknown content type
        self.downloadedAllBooks = True

        # There's probably a folder inside this folder
        folders = os.listdir(unzippedFolderPath)
        assert len(folders) == 1  # else maybe a previous download failed -- just manually delete the folder
        desiredFolderName = folders[0] + '/'
        USFMBible.__init__(self,
                           os.path.join(unzippedFolderPath, desiredFolderName),
                           givenName=resourceDict['name'])
    else:
        self.downloadedAllBooks = False
        self.attemptedDownload = {}
        os.makedirs(unzippedFolderPath, exist_ok=True)
        USFMBible.__init__(self, unzippedFolderPath,
                           givenName=resourceDict['name'])
    self.objectNameString = 'DCS USFM Bible object'
def __init__(self, parameterOne, resourcesObject=None):
    """
    Create the Door43 cataloged Bible object.

    parameterOne can be:
        a catalog dictionary entry (and second parameter must be None)
    or
        an index into the BibleList in the resourcesObject passed as the second parameter

    The catalog entry's formats are searched for a zipped USFM download.
    Unless a local folder newer than the entry's 'issued' time already
    exists, the zip file is downloaded and unzipped, and USFMBible is then
    initialised from the single folder found inside the unzipped folder.

    NOTE: If no zip URL is found, or the download fails with a URLError,
        the problem is logged and this method returns early WITHOUT
        calling USFMBible.__init__.
    """
    # Local import: only the `datetime` class is known to be in scope here
    from datetime import timezone

    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
        print(
            f"Door43CatalogBible.__init__( {parameterOne}, {resourcesObject} )…"
        )

    if isinstance(parameterOne, dict):
        assert resourcesObject is None
        resourceDict = parameterOne
    else:
        assert isinstance(parameterOne, int)
        assert resourcesObject  # why ??? and isinstance( resourcesObject, Door43CatalogResources )
        resourceDict = resourcesObject.getBibleResourceDict(parameterOne)
    assert resourceDict and isinstance(resourceDict, dict)

    # Use .get() here: the 'formats' key may legitimately be missing
    #   (see the fallback to 'projects' just below).
    if debuggingThisModule:
        print('formats', resourceDict.get('formats'))
    if 'formats' in resourceDict:
        formats = resourceDict['formats']
    else:
        assert len(resourceDict['projects']) == 1
        formats = resourceDict['projects'][0]['formats']
    assert formats

    # Find the first format entry that is a zip file of USFM
    for formatDict in formats:
        formatString = formatDict['format']
        if 'application/zip;' in formatString and 'usfm' in formatString:
            size, zipURL = formatDict['size'], formatDict['url']
            break
    else:
        logging.critical(
            f"No zip URL found for '{resourceDict['language']}' '{resourceDict['title']}'"
        )
        return

    # See if files already exist and are current (so don't download again)
    alreadyDownloadedFlag = False
    unzippedFolderPath = os.path.join(
        BibleOrgSysGlobals.DOWNLOADED_RESOURCES_FOLDER, 'Door43Catalog/',
        f"{resourceDict['language']}_{resourceDict['title']}/")
    if os.path.isdir(unzippedFolderPath):
        # 'issued' ends in '+00:00' (UTC) and parses to a naive datetime,
        #   so the folder time must be expressed as naive UTC as well
        #   (a plain fromtimestamp() would give local time).
        issuedDatetime = datetime.strptime(resourceDict['issued'],
                                           '%Y-%m-%dT%H:%M:%S+00:00')
        folderModifiedDatetime = datetime.fromtimestamp(
            os.stat(unzippedFolderPath).st_mtime,
            tz=timezone.utc).replace(tzinfo=None)
        alreadyDownloadedFlag = folderModifiedDatetime > issuedDatetime

    if alreadyDownloadedFlag:
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print("Skipping download because folder '{}' already exists.".
                  format(unzippedFolderPath))
    else:
        # Download the zip file (containing all the USFM files, LICENSE.md, manifest.yaml, etc.)
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print("Downloading {:,} bytes from '{}'…".format(size, zipURL))
        try:
            HTTPResponseObject = urllib.request.urlopen(zipURL)
        except urllib.error.URLError as err:
            logging.critical("Door43 URLError '{}' from {}".format(
                err, zipURL))
            return

        # Make sure the connection is released even if something below fails
        with HTTPResponseObject:
            contentType = HTTPResponseObject.info().get('content-type')
            if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                print(" contentType", contentType)
            if contentType == 'application/zip':
                os.makedirs(unzippedFolderPath, exist_ok=True)
                # Bug in Python up to 3.7 makes SpooledTemporaryFile not work
                #   for large aligned Bibles (3+ MB), hence TemporaryFile.
                # The temp file is deleted automatically when closed.
                with tempfile.TemporaryFile() as myTempFile:
                    myTempFile.write(HTTPResponseObject.read())
                    with zipfile.ZipFile(myTempFile) as myzip:
                        # NOTE: Could be a security risk here
                        myzip.extractall(unzippedFolderPath)
                # Re-extracting into an existing folder doesn't change the
                #   folder's own modification time, so touch it explicitly
                #   or the freshness check above would never succeed.
                os.utime(unzippedFolderPath)
            else:
                # NOTE(review): `halt` is not defined in the visible code --
                #   presumably a deliberate NameError to abort. Confirm.
                halt  # unknown content type

    # There's probably a folder inside this folder
    folders = os.listdir(unzippedFolderPath)
    assert len(folders) == 1
    desiredFolderName = folders[0] + '/'
    USFMBible.__init__(self,
                       os.path.join(unzippedFolderPath, desiredFolderName),
                       givenName=resourceDict['title'],
                       givenAbbreviation=resourceDict['identifier'])
    self.objectNameString = 'Door43 USFM Bible object'