class ApplicationBase(object):
    '''
    Application Base class to perform many of the basic ETL process.

    Derived classes must implement ProcessTables.  The usual life cycle is
    Start (load the configuration, create the working folders and helpers)
    followed by ProcessInput (walk the databases/catalogs/tables declared in
    the job configuration).
    '''
    __metaclass__ = ABCMeta

    def __init__(self):
        '''
        Define the class attributes
        '''
        self.logger = None
        self.moduleName = None
        self.awsParams = None
        self.fileUtilities = None
        self.bcpUtilities = None
        self.job = None
        self.localTempDirectory = None
        self.location = None
        self.localTempDataDirectory = None
        self.etlUtilities = None

    @staticmethod
    def _ErrorMessage(err):
        '''
        Return a printable message for the exception err.

        Prefers the legacy ``message`` attribute when it is present and
        non-empty, and falls back to ``str(err)`` otherwise (the attribute
        does not exist on Python 3 and is empty for multi-argument
        exceptions on Python 2).
        '''
        message = getattr(err, "message", None)
        return message if message else str(err)

    def BuildTableCreationScript(self, sqlTemplateFilename):
        '''
        Construct the actual DDL script from the template by replacing the appropriate tokens

        :param sqlTemplateFilename: template file name, looked up under self.location
        :return: path of the generated script under self.localTempDirectory
        '''
        sqlTableCreationTemplate = self.location + '/' + sqlTemplateFilename
        # "xxxTemplate.sql" becomes "xxx.sql"; the dot is escaped so only a real ".sql" suffix matches
        sqlTableCreationScript = self.localTempDirectory + "/" + re.sub(
            r'Template\.sql$', '.sql', sqlTemplateFilename)
        self.fileUtilities.CreateActualFileFromTemplate(
            sqlTableCreationTemplate, sqlTableCreationScript,
            self.job["destinationSchema"], self.job["tableName"])
        self.logger.info(self.moduleName + " - SQL files created.")
        return sqlTableCreationScript

    def BuildTableCreationScriptTable(self, sqlTemplateFilename, tableName, templateFolder=None, sqlFolder=None):
        '''
        Construct the actual DDL script from the template for the specific table
        by replacing the appropriate tokens

        :param sqlTemplateFilename: template file name
        :param tableName: table name; also used as the prefix of the generated file name
        :param templateFolder: optional sub folder of self.location holding the template
        :param sqlFolder: optional sub folder of self.localTempDirectory for the output
        :return: path of the generated script
        '''
        sqlTableCreationTemplate = self.location + '/'
        if templateFolder is not None:
            sqlTableCreationTemplate += templateFolder + '/'
        sqlTableCreationTemplate += sqlTemplateFilename

        sqlTableCreationScript = self.localTempDirectory + "/"
        if sqlFolder is not None:
            sqlTableCreationScript += sqlFolder + "/"
        sqlTableCreationScript += tableName + re.sub(
            r'Template\.sql$', '.sql', sqlTemplateFilename)

        self.fileUtilities.CreateActualFileFromTemplate(
            sqlTableCreationTemplate, sqlTableCreationScript,
            self.job["destinationSchema"], tableName)
        self.logger.info(self.moduleName + " - " + tableName + " - SQL files created.")
        return sqlTableCreationScript

    def CreateTables(self, sqlTemplateFilename):
        '''
        Create the actual tables
        '''
        sqlTableCreationScript = self.BuildTableCreationScript(sqlTemplateFilename)

        # The following code will recreate all the tables.  EXISTING DATA WILL BE DELETED
        RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
        self.logger.info(self.moduleName + " - SQL tables created.")

    def LoadEnvironmentVariables(self, logger):
        '''
        sub method to just load in all environment variables

        :return: self, so that the call can be chained
        '''
        self.logger = logger
        # Load the AWS configuration parameters for S3 and Redshift
        self.awsParams = ConfigureAWS.ConfigureAWS()
        self.awsParams.LoadAWSConfiguration(self.logger)
        return self

    def Start(self, logger, moduleName, filelocs):
        '''
        Start the process.  Do the common operations.

        :param logger: logger used by this object and by the helpers it creates
        :param moduleName: name of the module; also used as the working sub folder name
        :param filelocs: dictionary with "relativeOutputfolder" and, optionally,
                         "relativeInputfolder" and "tblEtl"
        '''
        # The method configures this very instance, so there is no need to rebind self
        self.LoadEnvironmentVariables(logger)
        self.logger.info(moduleName + " - Getting configuration information.")
        self.moduleName = moduleName

        # Load the job parameters
        self.fileUtilities = FileUtilities(logger)
        jobConfigFile = self.location + '/jobConfig.json'
        self.job = self.fileUtilities.LoadJobConfiguration(jobConfigFile)

        # This is where all the work files will be created
        self.localTempDirectory = FileUtilities.PathToForwardSlash(
            filelocs["relativeOutputfolder"] + "/" + moduleName)
        FileUtilities.CreateFolder(self.localTempDirectory)

        # This is where all the local data will be located
        if "relativeInputfolder" in filelocs:
            self.localTempDataDirectory = FileUtilities.PathToForwardSlash(
                filelocs["relativeInputfolder"] + "/" + moduleName)
            FileUtilities.CreateFolder(self.localTempDataDirectory)

        self.bcpUtilities = BCPUtilities(logger, self.fileUtilities, self.awsParams, self.localTempDirectory)

        # Create tables if we have a valid script
        if "sqlScript" in self.job:
            self.CreateTables(self.job["sqlScript"])

        # Create etlprocess log table if it does not already exist
        if "tblEtl" in filelocs:
            self.etlUtilities = EtlLoggingUtilities(self.logger)
            self.etlUtilities.awsParams = self.awsParams
            self.etlUtilities.filelocs = filelocs
            self.etlUtilities.moduleName = self.moduleName
            self.etlUtilities.appschema = filelocs["tblEtl"]["appschema"]
            self.etlUtilities.StartEtlLogging()

        if "folders" in self.job:
            self.fileUtilities.moduleName = self.moduleName
            self.fileUtilities.localBaseDirectory = self.localTempDirectory
            self.fileUtilities.CreateFolders(self.job["folders"])

    def CreateFolders(self, subFolder):
        '''
        Create the various subfolders defined in the jobConfig.json for the table being processed
        '''
        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + subFolder
        self.fileUtilities.CreateFolders(self.job["folders"])

    def UploadFilesCreateAthenaTablesAndSqlScripts(self, table, localParquetFolderName, partitionValue=None):
        '''
        Upload Parquet files into S3
        Create Athena Table/Partition
        Create script to create a RedShift table and save to S3 (note that the ETL may not necessarily load data into Redshift)
        Create script to insert data into Redshift and save to S3 (note that the ETL may not necessarily load data into Redshift)

        :return: False when no parquet file was found for localParquetFolderName, True otherwise
        '''
        # NOTE: the pattern is appended as-is, so localParquetFolderName is expected
        # to already end with a path separator
        if not FileUtilities.FilesExistInFolder(localParquetFolderName + "*.parquet"):
            # Nothing was created.  We have a problem
            self.logger.info(
                self.moduleName + " - No parquet files were created for current partition in: " +
                localParquetFolderName + ".  Nothing was processed on Athena.")
            return False

        self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

        scriptPartitionValue = partitionValue
        if AthenaUtilities.IsTablePartitioned(table):
            # For partitioned tables, the script will insert a where clause by default.  However, if we are doing a new load
            # skip the where clause so that we can have SQL script that is capable of loading all the data from Athena
            # into RedShift in the future
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                table["schemaName"], table["table"])
            # Do not update scripts if data has been previously loaded
            if not S3Utilities.KeyExist(self.awsParams, s3FolderLocation):
                scriptPartitionValue = None

        AthenaUtilities.SqlToLoadDataFromAthena(
            self.logger, table, self.fileUtilities.sqlFolder, scriptPartitionValue)
        AthenaUtilities.UploadFilesAndCreateAthenaTables(
            self.awsParams, localParquetFolderName, table,
            self.fileUtilities.sqlFolder, self.logger, partitionValue)
        return True

    def LoadDataFromAthenaIntoRedShiftLocalScripts(self, table, customWhereCondition=None):
        '''
        If at a later time we decide to drop the Redshift table and re-load the data from Athena, we need a utility to do that

        :param table: table definition used to compose the script file names
        :param customWhereCondition: optional extra condition appended (with AND) to the load script
        '''
        # Under the hood the table will be recreated if the new flag is on or if the table does not exist
        # Load the data from Athena into RedShift after that.  The load query only loads what needed from Athena
        scriptToCreateRedshiftTable = FileUtilities.ComposeCreateTableSqlFilename(
            table, self.fileUtilities.sqlFolder)
        RedshiftUtilities.PSqlExecute(scriptToCreateRedshiftTable, self.logger)

        scriptToLoadDataFromAthena = AthenaUtilities.ComposeInsertIntoSqlFilename(
            table, self.fileUtilities.sqlFolder)
        if customWhereCondition:
            # Write a "_custom.sql" copy of the script in which every ";" is replaced
            # by " AND <condition>;" and run that copy instead
            customWhereCondition = " AND " + customWhereCondition + ";"
            replacements = {';': customWhereCondition}
            scriptToLoadDataFromAthenaCustom = scriptToLoadDataFromAthena + "_custom.sql"
            self.fileUtilities.ReplaceStringInFile(
                scriptToLoadDataFromAthena, scriptToLoadDataFromAthenaCustom, replacements)
            scriptToLoadDataFromAthena = scriptToLoadDataFromAthenaCustom
        RedshiftUtilities.PSqlExecute(scriptToLoadDataFromAthena, self.logger)

    def LoadDataFromAthenaIntoRedShiftS3Scripts(self, table):
        '''
        If at a later time we decide to drop the Redshift table and re-load the data from Athena, we need a utility to do that
        '''
        # Download scripts from S3 to local folder
        AthenaUtilities.DownloadScriptsForRedShift(
            self.awsParams, table, self.fileUtilities.sqlFolder)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

    def ProcessTables(self, dbCommon, tables):
        '''
        Process Tables in the actual derived class
        '''
        # YOU MUST IMPLEMENT THIS METHOD IN THE DERIVED CLASS
        raise NotImplementedError()

    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        pulls data from each table in the catalog

        Any exception is logged and re-raised unchanged.
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessCatalogs for " + catalog["name"] + " starting")
            for tables in catalog["tables"]:
                self.ProcessTables(dbCommon, tables)
            self.logger.debug(self.moduleName + " -- ProcessCatalogs for " + catalog["name"] +
                              " finished ----------.")
        except Exception:
            self.logger.exception(
                self.moduleName + " - we had an error in ProcessCatalogs for " + catalog["name"])
            raise

    def ProcessDatabase(self, databaseSettings):
        '''
        takes the database settings and tries to process them

        Catalogs are processed unless they carry an "execute" flag different from 'Y'.
        Any exception is logged and re-raised unchanged.
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessDatabase for " +
                              databaseSettings["common"]["name"] + " starting")
            for catalog in databaseSettings["catalogs"]:
                if "execute" not in catalog or catalog["execute"] == 'Y':
                    self.ProcessCatalogs(databaseSettings["common"], catalog)
                else:
                    self.logger.debug(self.moduleName + " -- ProcessDatabase skip for " + catalog["name"])

            if "cleanlocal" in self.job and self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- ProcessDatabase for " +
                              databaseSettings["common"]["name"] + " finished")
        except Exception:
            self.logger.exception(
                self.moduleName + " - we had an error in ProcessDatabase for " +
                databaseSettings["common"]["name"])
            raise

    def ProcessInput(self, logger, moduleName, filelocs):
        '''
        Bootstrap code that process all the databases, catalogs and tables

        :param logger: not used here; self.logger is used instead
        :param moduleName: module name used in the failure log message
        :param filelocs: when it contains "tblEtl" the run is registered with the ETL logging helper
        :raises Exception: carrying the message of whatever error interrupted the processing
        '''
        currProcId = None
        try:
            self.logger.debug(self.moduleName + " -- " + "Starting...")
            if "tblEtl" in filelocs:
                currProcId = self.etlUtilities.GetRunID(
                    filelocs["tblEtl"]["table"], self.moduleName)

            if "Databases" in self.job:
                for databaseSettings in self.job["Databases"]:
                    if databaseSettings["execute"] == 'Y':
                        self.ProcessDatabase(databaseSettings)
                    else:
                        self.logger.debug(self.moduleName + " -- Skipping database: " +
                                          databaseSettings["common"]["name"])
            elif "catalogs" in self.job:
                self.ProcessDatabase(self.job)
            elif "tables" in self.job:
                dbCommon = None
                if "common" in self.job:
                    dbCommon = self.job["common"]
                self.ProcessCatalogs(dbCommon, self.job)

            # "cleanlocal" is optional, as it is in ProcessDatabase
            if "cleanlocal" in self.job and self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished.")
        except Exception as err:
            message = self._ErrorMessage(err)
            self.logger.exception(moduleName + " - Exception! Error: " + message)
            # Flag the run as failed; the None check keeps the original error from being
            # masked when the ETL helper was never created
            if "tblEtl" in filelocs and self.etlUtilities is not None and \
                    self.etlUtilities.CompleteInstance(
                            filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName + " - we could not Complete Instance.")
            raise Exception(message)
class PGCRAirMarketsAthenaSpark(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")

        S3Utilities.SyncFolderAWSCli("s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
                                     self.rawDataFolder,
                                     True)

    def CleanUpAndPack(self):
        '''
        Main control to iterate thru the folders cleaning the files and packing them to be uploaded to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)

        for rawFolderName in rawFolders:
            # Each raw folder is unpacked, cleaned and packed on its own
            self.toPackFiles = []

            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compress the files for a given folder, right now is only the emissions file being packed.

        Every file listed in self.toPackFiles is gzipped into self.packedFolder
        and the uncompressed file is then deleted.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")

        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(csvFile) + ".gz"

            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files replacing bad characters.

        CSV files in self.tempFolder whose name contains job["srcFileNamePrefix"] are
        cleaned and queued in self.toPackFiles; every other CSV file is deleted.
        '''
        srcPrefix = self.job["srcFileNamePrefix"]
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [fileName for fileName in allFiles if srcPrefix in fileName]
        fileListToDel = [fileName for fileName in allFiles if srcPrefix not in fileName]

        self.logger.info("Cleaning up files for folder " + rawFolderName + "...")

        # Strip non-ASCII characters and turn single quotes into back-ticks.
        # The upper bound used to be \x76, which also removed the printable ASCII
        # characters "w" to "~"; \x7F is the end of the ASCII range.
        replacements = [{r"[^\x00-\x7F]+": ""}, {"'": "`"}]

        # NOTE: the output name depends only on the prefix and the folder, so several
        # matching files in one folder would be written to the same target
        toPackFileName = self.tempFolder + "/" + srcPrefix + "_" + rawFolderName + ".csv"

        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile

            self.fileUtilities.ReplaceIterativelyInFile(fullFileName, toPackFileName, replacements)
            self.fileUtilities.RemoveLines(toPackFileName, self.job["removeLines"])
            self.toPackFiles.append(toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" + airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...

        A failure to unzip is logged and otherwise ignored, so that the
        remaining folders can still be processed.
        '''
        # Built outside the try block so that the failure message can always refer to it
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job["inputZipFileName"]
        self.logger.info("Unpacking file: " + filePath)

        try:
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:  # StandardError does not exist on Python 3
            message = getattr(err, "message", None) or str(err)
            self.logger.info("Unable to decompress file: " + filePath + " Error: " + message)

    def UploadPackedToS3(self):
        '''
        Uploads all files packed to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(self.packedFolder,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
                                    "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into the final table in Redshift.

        The s3 data folder is deleted once the load has been issued.  Any error is
        logged and re-raised.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                                  host=self.awsParams.redshift['Hostname'],
                                                  port=self.awsParams.redshift['Port'],
                                                  user=self.awsParams.redshiftCredential['Username'],
                                                  password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["tableName"] + self.job["srcFileNamePrefix"],
                                                 "s3Filename": s3DataFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error("Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def ProcessTable(self, table):
        '''
        Process data for the table

        Downloads the archive named by job["s3Filename"], unzips it, reads the
        emissions CSV with Spark using the schema built from the table definition,
        saves it as parquet and publishes it through
        UploadFilesCreateAthenaTablesAndSqlScripts.

        :param table: table definition; table["table"] names the working sub folder
        :return: None
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)
        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, local7zipFilePath)

        # The archive is extracted into a folder named after it, minus the ".zip" suffix
        localCsvFilepath = re.sub(r'\.zip$', '', self.fileUtilities.csvFolder + "/" + fileName)
        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)

        # NOTE: the name of the CSV inside the archive is hard coded
        fileToBeloaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"], ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeloaded))

        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine

        Runs the common start-up of ApplicationBase and then processes every table
        listed in job["tables"].

        :param logger: logger handed to ApplicationBase.Start
        :param moduleName: name of the module
        :param filelocs: folder locations; when it contains "tblEtl" the run is
                         registered with the ETL logging helper
        :raises Exception: carrying the message of whatever error interrupted the run
        '''
        # Bound before the try block so that the failure handler can always read it
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")

            # ETL logging is optional, exactly as in ApplicationBase
            if "tblEtl" in filelocs:
                currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)

            for table in self.job["tables"]:
                self.ProcessTable(table)

            if "cleanlocal" in self.job and self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            # err.message does not exist on Python 3 and is empty for multi-argument exceptions
            message = getattr(err, "message", None) or str(err)
            log = self.logger if self.logger is not None else logger
            log.exception(moduleName + " - Exception! Error: " + message)
            # Flag the run as failed; the guards keep the original error from being
            # masked when the ETL helper was never created
            if "tblEtl" in filelocs and self.etlUtilities is not None and \
                    self.etlUtilities.CompleteInstance(
                            filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                log.info(moduleName + " - we could not Complete Instance.")
            raise Exception(message)