class Vantage(ApplicationBase):
    '''
    This class is used to get the Vantage data from the IHS Vantage database,
    transform it and load it into Redshift.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()
        self.awsParams = ""
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow through the different data sets coming from the Vantage DB.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                self.logger.info(self.moduleName + " - Starts extracting " +
                                 dsScript["tableSuffix"] + " data...")
                self.bcpUtilities.RunBCPJob(
                    self.job["mssqlLoginInfo"],
                    self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities.LoadSQLQuery(self.location +
                                                    dsScript["scriptFile"]),
                    self.localTempDirectory + "/Raw/" +
                    dsScript["tableSuffix"] + ".CSV",
                    self.job["delimiter"])
        except Exception as err:
            self.logger.error("Error while trying to bulk extract all. Message: " +
                              str(err))
            raise

    def TransformAndPackAll(self):
        '''
        Compress the csv files created.
        '''
        rawFiles = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")
        try:
            for rFile in rawFiles:
                rFileFull = self.rawFolder + "/" + rFile
                self.logger.info(self.moduleName +
                                 " started compressing file: " + rFile)
                self.fileUtilities.GzipFile(
                    rFileFull, self.packedFolder + "/" + rFile + ".GZ")
                self.fileUtilities.RemoveFileIfItExists(rFileFull)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " Error while compressing raw files. Message: " +
                              str(err))
            raise

    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage S3 bucket into Redshift.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema": self.job["destinationSchema"],
                        "tableName": self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename": s3DataFolder + "/" +
                                      dsScript["tableSuffix"] + ".CSV.GZ",
                        "fileFormat": self.job["fileFormat"],
                        "dateFormat": self.job["dateFormat"],
                        "delimiter": self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                        "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all the GZIP files created to S3; they are loaded into
        Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"

            self.fileUtilities.RemoveFolder(self.packedFolder)
            self.fileUtilities.RemoveFolder(self.rawFolder)
            self.fileUtilities.CreateFolder(self.packedFolder)
            self.fileUtilities.CreateFolder(self.rawFolder)

            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            raise Exception(str(err))
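# A minimal sketch of the job configuration Vantage expects. The key names are
# taken from the self.job[...] lookups above; every value is an illustrative
# assumption, not taken from the original source.
SAMPLE_VANTAGE_JOB = {
    "mssqlLoginInfo": "<server>,<user>,<password>",  # assumed format for RunBCPJob
    "bcpUtilityDirOnLinux": "/opt/mssql-tools/bin",
    "extractingScripts": [
        {"tableSuffix": "PROJECTS", "scriptFile": "/sql/extract_projects.sql"}
    ],
    "bucketName": "example-bucket",
    "s3ToDirectory": "/vantage/packed",
    "destinationSchema": "example_schema",
    "tableName": "vantage_",  # suffixed with each tableSuffix on load
    "fileFormat": "CSV",
    "dateFormat": "auto",
    "delimiter": "|"
}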
class PGCRAirMarkets(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarkets, self).__init__()
        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3.
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")
        S3Utilities.SyncFolderAWSCli(
            "s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
            self.rawDataFolder, True)

    def CleanUpAndPack(self):
        '''
        Main control to iterate through the folders, cleaning the files and
        packing them to be uploaded to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)
        for rawFolderName in rawFolders:
            self.toPackFiles = []
            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compress the files for a given folder; currently only the emissions
        file is packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")
        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(
                csvFile) + ".gz"
            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] in fileName
        ]
        fileListToDel = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] not in fileName
        ]

        self.logger.info("Cleaning up files for folder " + rawFolderName + "...")
        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job[
                "srcFileNamePrefix"] + "_" + rawFolderName + ".csv"
            # Note: the first pattern strips any character above \x76 ('v'),
            # which includes all non-ASCII bytes.
            self.fileUtilities.ReplaceIterativelyInFile(
                fullFileName, toPackFileName, [{
                    r"[^\x00-\x76]+": ""
                }, {
                    "'": "`"
                }])
            self.logger.info("toPackFileName: " + toPackFileName)
            fc = self.fileUtilities.GetFileContents(toPackFileName)
            if fc is not None and len(fc) > 2:
                self.logger.info("2nd line from the file:")
                self.logger.info(fc[1].strip())
            else:
                self.logger.info("File is empty")
            lineCount = self.fileUtilities.GetLineCount(toPackFileName)
            if lineCount > 0:
                self.fileUtilities.RemoveLines(toPackFileName,
                                               self.job["removeLines"])
                self.toPackFiles.append(toPackFileName)
            else:
                self.logger.warning("Empty file: " + toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" +
                                          airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file.
        '''
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job[
            "inputZipFileName"]
        try:
            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            self.logger.info("Unable to decompress file: " + filePath +
                             " Error: " + str(err))

    def UploadPackedToS3(self):
        '''
        Uploads all the packed files to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into
        the final table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] +
                                 self.job["srcFileNamePrefix"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                        "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")
        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)
        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.tempFolder = self.localTempDirectory + "/Temp"
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawDataFolder = self.localTempDirectory + "/RawData"

            self.CleanWorkingFolders()
            self.SynchronizeSourceFolder()
            self.CleanUpAndPack()
            self.UploadPackedToS3()
            self.LoadAirMarketsTables()
        except Exception:
            logger.exception(moduleName + " - Exception!")
            raise
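# A standalone sketch of the character clean-up implied by the two patterns
# passed to ReplaceIterativelyInFile above, assuming the helper applies them
# with re.sub-style semantics (its actual implementation is not shown in this
# excerpt). Note that \x76 is 'v', so the first pattern strips not only
# non-ASCII bytes but also the ASCII range above 'v' (w-z, {|}~, DEL).
if __name__ == "__main__":
    import re
    line = "Plant 'Delta' \xe2\x80\x93 NOx"
    line = re.sub(r"[^\x00-\x76]+", "", line)  # -> "Plant 'Delta'  NO"
    line = line.replace("'", "`")              # -> "Plant `Delta`  NO"
    print(line)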
class Consensus(ApplicationBase):
    '''
    This class is used to control the data load process from different
    OPEC file sources.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(Consensus, self).__init__()
        self.awsParams = ""
        self.rawFolder = None
        self.csvFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(self.job["srcSharedFolder"])
        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")
        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"] and
                    os.path.splitext(fileName)[1] in self.job["validExts"]):
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Cleans up the data frame loaded from the configured worksheet: drops
        the configured columns, keeps only the date columns, adds the survey
        date and melts the result into long format.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"

        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, axis=1)
        df = df.drop(self.job["dropAfterHeader"], axis=0)

        for colName in df.head(0):
            if not isinstance(colName, datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, axis=1)

        df = df.assign(surveyDate=surveyDateVal)

        newOrder = [surveyDateColName]
        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)
        df = df[newOrder]
        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])
        return df

    def LoadAllFromS3(self):
        '''
        Loads the generated gzipped CSV files from s3 into Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                        "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the data frame or, in "getSurveyDate" mode, the survey date.
        '''
        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]

        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])

        if mode == "getSurveyDate":
            valReturn = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            valReturn = df
        return valReturn

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Derives the survey date from the file's name.
        '''
        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(surveyDateColValue[:3])) + "-01"
        return surveyDateColValue

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean-up and packing of
        the input files.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)
        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " + emFile)
            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.csvFolder + "/" + os.path.splitext(
                emFile)[0] + ".csv"
            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")
                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)
                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)
                self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
                self.fileUtilities.DeleteFile(csvFilename)
            except XLRDError:
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " + emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " + emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def CheckWorkingFolders(self):
        '''
        Removes and re-creates the working folders.
        '''
        self.logger.info(self.moduleName + " - Checking working folders...")
        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)

    def BulkUploadToS3(self):
        '''
        Uploads all the GZIP files created to S3; they are loaded into
        Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.csvFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- starting")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            self.rawFolder = self.localTempDirectory + "/Raw"
            self.csvFolder = self.localTempDirectory + "/CSV"

            self.CheckWorkingFolders()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- finished")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(
                    filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not complete the instance.")
            raise Exception(str(err))
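# A quick illustration of Consensus.FormatSurveyDate with an invented file
# name. A two-character prefix is assumed (BulkDownload matches the first
# two characters against job["fileNamePrefix"]); "CF" is the optional marker
# the method strips.
if __name__ == "__main__":
    # "EMCFJan2017.xlsx": strip extension -> "EMCFJan2017"; drop the 2-char
    # prefix -> "CFJan2017"; remove "CF" -> "Jan2017"; then year plus the
    # calendar.month_abbr index of "Jan":
    print(Consensus.FormatSurveyDate("EMCFJan2017.xlsx"))  # -> "2017-1-01"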
class AutoInsight(ApplicationBase):
    '''
    This class is used to control the data load process for Auto Insight.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(AutoInsight, self).__init__()
        self.awsParams = ""
        self.processingFile = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(self.job["srcSharedFolder"])
        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")
        for fileName in sharedFiles:
            if fileName == self.job["fileName"]:
                self.processingFile = fileName
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.localTempDirectory + "/" + self.processingFile)

    def LoadAllFromS3(self):
        '''
        Loads the gzipped CSV file from s3 into the final table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["ddl"]["table"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                        "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    @staticmethod
    def FormatColNameDate(dtText):
        '''
        Converts an abbreviated date such as "Jan 2017" to YYYY-MM-01 format;
        returns the text unchanged if it cannot be parsed.
        '''
        try:
            textFixed = str(dtText[4:]) + "-" + str(
                list(calendar.month_abbr).index(dtText[:3])) + "-01"
        except Exception:
            textFixed = dtText
        return textFixed

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean-up and packing of
        the input file.
        '''
        self.logger.info(self.moduleName + " - Processing file: " +
                         self.processingFile)
        rawFileName = self.localTempDirectory + "/" + self.processingFile
        csvFilename = self.localTempDirectory + "/" + \
            self.processingFile.split(".")[0] + ".csv"
        try:
            columnNames = []
            df = pandas.read_excel(rawFileName,
                                   sheetname=self.job["worksheetName"],
                                   index_col=None,
                                   na_values=None,
                                   skiprows=self.job["skipRows"],
                                   skip_footer=self.job["skipFooter"])
            for colName in df.head(0):
                if colName not in self.job["columns_no_melt"]:
                    columnNames.append(self.FormatColNameDate(colName))
                else:
                    columnNames.append(colName)
            df.columns = columnNames
            df = df.melt(id_vars=self.job["columns_no_melt"])
            df.to_csv(csvFilename,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)
            self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
            self.fileUtilities.DeleteFile(csvFilename)
        except XLRDError:
            self.logger.info(self.moduleName + " - No tab named '" +
                             self.job["worksheetName"] + "' in " +
                             self.processingFile)
        except Exception:
            self.logger.error(self.moduleName +
                              " - Error while trying to process file " +
                              self.processingFile)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(rawFileName)

    def BulkUploadToS3(self):
        '''
        Uploads the GZIP file created to S3; it is loaded into Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        fileName = self.processingFile.split(".")[0] + ".csv.gz"
        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileName,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--quiet")

    def ExecuteCreateTable(self):
        '''
        Creates the destination table if needed.
        '''
        tb = self.job['ddl']
        tb['schemaName'] = self.job['destinationSchema']
        fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
        RedshiftUtilities.PSqlExecute(fname, self.logger)

    def Start(self, logger, moduleName, filelocs):
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- starting")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            self.ExecuteCreateTable()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- finished")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(
                    filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not complete the instance.")
            raise Exception(str(err))
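# A minimal sketch of the wide-to-long reshape ProcessFiles performs. The
# column names are invented; "Region" stands in for whatever the job's
# "columns_no_melt" list contains. Unlike the class (which skips those
# columns explicitly), this sketch relies on FormatColNameDate's fallback
# to pass non-date names through unchanged.
if __name__ == "__main__":
    import pandas
    wide = pandas.DataFrame({"Region": ["US", "EU"],
                             "Jan 2017": [1.2, 2.1],
                             "Feb 2017": [1.3, 2.2]})
    # "Jan 2017" -> "2017-1-01"; "Region" fails to parse and is kept as-is
    wide.columns = [AutoInsight.FormatColNameDate(c) for c in wide.columns]
    longDf = wide.melt(id_vars=["Region"])  # columns: Region, variable, value
    print(longDf)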
class PGCRAirMarketsAthenaSpark(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()
        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3.
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")
        S3Utilities.SyncFolderAWSCli(
            "s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
            self.rawDataFolder, True)

    def CleanUpAndPack(self):
        '''
        Main control to iterate through the folders, cleaning the files and
        packing them to be uploaded to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)
        for rawFolderName in rawFolders:
            self.toPackFiles = []
            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compress the files for a given folder; currently only the emissions
        file is packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")
        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(
                csvFile) + ".gz"
            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] in fileName
        ]
        fileListToDel = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] not in fileName
        ]

        self.logger.info("Cleaning up files for folder " + rawFolderName + "...")
        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job[
                "srcFileNamePrefix"] + "_" + rawFolderName + ".csv"
            self.fileUtilities.ReplaceIterativelyInFile(
                fullFileName, toPackFileName, [{
                    r"[^\x00-\x76]+": ""
                }, {
                    "'": "`"
                }])
            self.fileUtilities.RemoveLines(toPackFileName,
                                           self.job["removeLines"])
            self.toPackFiles.append(toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" +
                                          airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file.
        '''
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job[
            "inputZipFileName"]
        try:
            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            self.logger.info("Unable to decompress file: " + filePath +
                             " Error: " + str(err))

    def UploadPackedToS3(self):
        '''
        Uploads all the packed files to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into
        the final table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] +
                                 self.job["srcFileNamePrefix"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                        "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")
        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)
        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def ProcessTable(self, table):
        '''
        Processes the data for the given table: downloads the zipped CSV from
        s3, reads it with Spark, saves it as parquet and registers it with Athena.
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + \
            "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)
        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
        # Note: the file name inside the archive is currently hard-coded.
        fileToBeLoaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true',
                       delimiter=self.job["delimiter"],
                       ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeLoaded))
        self.logger.info(self.moduleName + " -- Done reading " +
                         str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet)
        self.logger.info(self.moduleName +
                         " -- UploadFilesCreateAthenaTablesAndSqlScripts finished")

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- starting")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            for table in self.job["tables"]:
                self.ProcessTable(table)

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- finished")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(
                    filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not complete the instance.")
            raise Exception(str(err))
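# A minimal sketch of the configuration ProcessTable consumes. Key names come
# from the self.job[...] and table[...] lookups above; the values and the
# field-list shape expected by SparkUtilities.BuildSparkSchema are
# illustrative assumptions.
SAMPLE_ATHENA_JOB = {
    "bucketName": "example-bucket",
    "s3Filename": "/airmarkets/packed/emissions.zip",
    "folders": ["gzips", "csvs", "parquet"],  # assumed layout for CreateFolders
    "delimiter": ",",
    "cleanlocal": "Y",
    "tables": [
        {
            "table": "emission",
            # assumed shape; the real format is defined by BuildSparkSchema
            "fields": [{"name": "facility_name", "type": "STRING"}]
        }
    ]
}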
class ECRConnect(ApplicationBase):
    '''
    This class is used to get the Risk data from IHS Connect,
    transform it and load it into Redshift.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(ECRConnect, self).__init__()
        self.awsParams = ""
        self.csvFile = None
        self.csvFileHistory = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def TransformToCsv(self, jData):
        '''
        Transforms the json response into gzipped csv files.
        '''
        try:
            # Gets the latest version
            df = json_normalize(jData, 'Risks', ['Country'])
            df['ClassName'] = ''
            df['ClassAvg'] = ''
            df = df[[
                'Country', 'Name', 'Value', 'Description', 'ClassName',
                'ClassAvg', 'UpdatedOn'
            ]]
            df.to_csv(self.csvFile,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)
            self.fileUtilities.GzipFile(self.csvFile, self.csvFile + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFile)

            # Gets the history
            df = json_normalize(jData, ['Risks', 'History'],
                                ['Country', ['Risks', 'Name']])
            df = df[['Country', 'Risks.Name', 'Value', 'UpdatedOn']]
            df.to_csv(self.csvFileHistory,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)
            self.fileUtilities.GzipFile(self.csvFileHistory,
                                        self.csvFileHistory + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFileHistory)
        except Exception as err:
            self.logger.error(
                "Error while trying to transform json to csv. Error: " +
                str(err))
            raise

    def GetAndTransform(self):
        '''
        Downloads the risk data from the IHS Connect API and transforms it.
        '''
        try:
            request = urllib2.Request(self.job["connectAPI"]["baseurl"] +
                                      self.job["connectAPI"]["riskService"])
            base64string = base64.b64encode(
                '%s:%s' % (self.job["connectAPI"]["username"],
                           self.job["connectAPI"]["password"]))
            request.add_header("Authorization", "Basic %s" % base64string)
            response = urllib2.urlopen(request)
            jData = json.load(response)
            self.TransformToCsv(jData)
        except Exception as err:
            self.logger.error(
                "Error while trying to get and transform from the IHS Connect API service. Error: "
                + str(err))
            raise

    def LoadAllFromS3(self, s3Source, tableName):
        '''
        Loads a single gzipped csv file from s3 into the given Redshift table.
        '''
        try:
            s3DataSource = "s3://" + self.job["bucketName"] + s3Source

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": tableName,
                    "s3Filename": s3DataSource,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    def UploadToS3(self):
        '''
        Uploads the GZIP files created to S3; they are loaded into
        Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        fileName = self.job["fileNameOut"] + ".gz"
        fileNameHistory = self.job["fileNameOutHistory"] + ".gz"
        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileName,
            's3://' + self.job["bucketName"] + self.job["s3ToDirectory"] +
            '/' + fileName)
        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileNameHistory,
            's3://' + self.job["bucketName"] + self.job["s3ToDirectory"] +
            '/' + fileNameHistory)

    def ExecutePostETL(self):
        '''
        Executes the post-load sql script.
        '''
        try:
            sqlTemplate = self.location + "/" + self.job["postSQLScript"]
            sqlScript = self.localTempDirectory + "/" + self.job["postSQLScript"]
            self.fileUtilities.CreateActualFileFromTemplate(
                sqlTemplate, sqlScript, self.job["destinationSchema"],
                self.job["tableName"])
            RedshiftUtilities.PSqlExecute(sqlScript, self.logger)
        except Exception as err:
            self.logger.error(
                self.moduleName +
                " - Error while updating the country codes. Message: " +
                str(err))
            raise

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.csvFile = self.localTempDirectory + "/" + self.job["fileNameOut"]
            self.csvFileHistory = self.localTempDirectory + "/" + self.job[
                "fileNameOutHistory"]

            self.GetAndTransform()
            self.UploadToS3()
            self.LoadAllFromS3(
                self.job["s3ToDirectory"] + '/' + self.job["fileNameOut"] + '.gz',
                self.job["tableName"])
            self.LoadAllFromS3(
                self.job["s3ToDirectory"] + '/' +
                self.job["fileNameOutHistory"] + '.gz',
                self.job["tableName"] + '_history')
            self.LoadAllFromS3(
                self.job["xReference"]["s3DataDirectory"],
                self.job["tableName"] + self.job["xReference"]["tableNameSfx"])
            self.ExecutePostETL()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            raise Exception(str(err))
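# Illustrative shape of the Connect risk-service response implied by the two
# json_normalize calls in TransformToCsv: a list of country records, each
# with a "Risks" list whose entries carry a "History" list. Field names come
# from the column selections above; the values are invented.
SAMPLE_RISK_RESPONSE = [
    {
        "Country": "Exampleland",
        "Risks": [
            {
                "Name": "Political violence",
                "Value": 2.5,
                "Description": "Sample description",
                "UpdatedOn": "2017-05-01T00:00:00Z",
                "History": [
                    {"Value": 2.4, "UpdatedOn": "2017-04-01T00:00:00Z"}
                ]
            }
        ]
    }
]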