class Vantage(ApplicationBase):
    '''
    This class is used to get the Vanatage data from IHS Vantage Database,
    transform it and load it into Redshift.

    Flow: BCP extract to local CSV -> gzip -> upload to S3 -> COPY into
    Redshift.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()
        self.awsParams = ""
        # Working folders; populated in Start() once the local temp
        # directory is known.
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow thru the different data sets coming from Vantage DB.

        Runs one BCP job per entry in job["extractingScripts"], each writing
        <tableSuffix>.CSV into the Raw folder.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                self.logger.info(self.moduleName + " Starts extracting " +
                                 dsScript["tableSuffix"] + " data...")
                self.bcpUtilities.RunBCPJob(
                    self.job["mssqlLoginInfo"],
                    self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities.LoadSQLQuery(self.location +
                                                    dsScript["scriptFile"]),
                    self.localTempDirectory + "/Raw/" +
                    dsScript["tableSuffix"] + ".CSV",
                    self.job["delimiter"])
        except Exception as err:
            # str(err) rather than err.message: the .message attribute was
            # removed in Python 3.
            self.logger.error(
                "Error while trying to Bulk Extract all. Message: " +
                str(err))
            raise

    def TransformAndPackAll(self):
        '''
        Compress the csv files created.

        Each Raw/<name>.CSV becomes Packed/<name>.CSV.GZ; the raw file is
        removed after a successful compression.
        '''
        rawFiles = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")
        try:
            for rFile in rawFiles:
                rFileFull = self.rawFolder + "/" + rFile
                self.logger.info(self.moduleName +
                                 " started compressing file: " + rFile)
                self.fileUtilities.GzipFile(
                    rFileFull, self.packedFolder + "/" + rFile + ".GZ")
                self.fileUtilities.RemoveFileIfItExists(rFileFull)
        except Exception as err:
            self.logger.error(
                self.moduleName +
                " Error while compressing raw files. Message: " + str(err))
            raise

    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage's S3 bucket into Redshift.

        One COPY per extracting script; the S3 staging folder is deleted
        once every table has been loaded.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema": self.job["destinationSchema"],
                        "tableName":
                            self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename": s3DataFolder + "/" +
                                      dsScript["tableSuffix"] + ".CSV.GZ",
                        "fileFormat": self.job["fileFormat"],
                        "dateFormat": self.job["dateFormat"],
                        "delimiter": self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName +
                             " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            # Always release the DB connection, even on failure.
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be uploaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        '''
        Entry point: prepares clean working folders and runs the
        extract -> pack -> upload -> load pipeline.
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"

            # Re-create the working folders so stale files from a previous
            # run cannot be picked up and uploaded.
            self.fileUtilities.RemoveFolder(self.packedFolder)
            self.fileUtilities.RemoveFolder(self.rawFolder)
            self.fileUtilities.CreateFolder(self.packedFolder)
            self.fileUtilities.CreateFolder(self.rawFolder)

            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            # Re-raise the original exception instead of wrapping it in a
            # new Exception(err.message): keeps the type and traceback.
            raise
class PGCRAirMarkets(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data.

    Flow: sync ZIPs from S3 -> unzip -> clean the emissions CSVs -> gzip ->
    upload to S3 -> COPY into Redshift.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarkets, self).__init__()
        self.awsParams = ""
        # Working folders; populated in Start() once the local temp
        # directory is known.
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []  # cleaned CSVs awaiting compression
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")
        S3Utilities.SyncFolderAWSCli(
            "s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
            self.rawDataFolder, True)

    def CleanUpAndPack(self):
        '''
        Main control to iterate thru the folders cleaning the files and
        packing them to be uploaded to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)
        for rawFolderName in rawFolders:
            # Reset the pack list for every source folder.
            self.toPackFiles = []
            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compress the files for a given folder, right now is only the
        emissions file being packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")
        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(
                csvFile) + ".gz"
            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files replacing bd
        characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        # Keep only the emissions files; everything else is deleted below.
        fileList = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] in fileName
        ]
        fileListToDel = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] not in fileName
        ]

        self.logger.info("Cleaning up files for folder " + rawFolderName +
                         "...")
        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            # NOTE(review): toPackFileName does not incorporate
            # airMarketFile, so if a folder holds more than one matching
            # CSV each pass overwrites the previous output — confirm that
            # is intended.
            toPackFileName = self.tempFolder + "/" + self.job[
                "srcFileNamePrefix"] + "_" + rawFolderName + ".csv"
            # Strip characters outside \x00-\x76 and replace single quotes
            # with backticks.
            self.fileUtilities.ReplaceIterativelyInFile(
                fullFileName, toPackFileName, [{
                    r"[^\x00-\x76]+": ""
                }, {
                    "'": "`"
                }])

            self.logger.info("toPackFileName" + toPackFileName)
            fc = self.fileUtilities.GetFileContents(toPackFileName)
            if fc is not None and len(fc) > 2:
                self.logger.info("2nd line from the file")
                self.logger.info(fc[1].strip())
            else:
                self.logger.info("File is empty")

            lineCount = self.fileUtilities.GetLineCount(toPackFileName)
            if lineCount > 0:
                # Drop the configured header/footer lines before packing.
                self.fileUtilities.RemoveLines(toPackFileName,
                                               self.job["removeLines"])
                self.toPackFiles.append(toPackFileName)
            else:
                self.logger.warning("Empty file: " + toPackFileName)

            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" +
                                          airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        # Build the path before the try so it is always bound when the
        # except handler formats the error message.
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job[
            "inputZipFileName"]
        try:
            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            # Best-effort: a bad archive is logged and skipped, not fatal.
            # str(err) rather than err.message (removed in Python 3).
            self.logger.info("Unable to decompress file: " + filePath +
                             " Error: " + str(err))

    def UploadPackedToS3(self):
        '''
        Uploads all files packed to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into
        the final table in Redshift.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName":
                        self.job["tableName"] + self.job["srcFileNamePrefix"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            # Close the connection on both the success and failure paths
            # (was previously leaked on error).
            if rsConnect is not None:
                rsConnect.close()

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process
        execution.
        '''
        self.logger.info("Cleaning local working folders...")
        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)
        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.tempFolder = self.localTempDirectory + "/Temp"
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawDataFolder = self.localTempDirectory + "/RawData"

            self.CleanWorkingFolders()
            self.SynchronizeSourceFolder()
            self.CleanUpAndPack()
            self.UploadPackedToS3()
            self.LoadAirMarketsTables()
        except Exception:
            # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
            # propagate untouched.
            logger.exception(moduleName + " - Exception!")
            raise
class Consensus(ApplicationBase):
    '''
    This class is used to control the data load process from different OPEC
    file sources.

    Flow: copy Excel files from a shared folder -> unpivot to CSV -> gzip ->
    upload to S3 -> COPY into Redshift.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(Consensus, self).__init__()
        self.awsParams = ""
        # Working folders; populated in Start() once the local temp
        # directory is known.
        self.rawFolder = None
        self.csvFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.

        Copies files whose name starts with the configured prefix and whose
        extension is in job["validExts"] from the shared folder into the
        local Raw folder.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(
            self.job["srcSharedFolder"])
        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")
        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"]
                ) and os.path.splitext(fileName)[1] in self.job["validExts"]:
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Converts the actual excel file into csv for the worksheet configured.

        Drops the configured columns/rows, keeps only the bank-name column
        plus columns whose header parsed as a datetime, prepends a
        surveyDate column and unpivots the frame.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"

        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, axis=1)
        df = df.drop(self.job["dropAfterHeader"], axis=0)

        # Keep the bank-name column and the forecast-period columns (the
        # ones whose header parsed as a datetime); drop everything else.
        for colName in df.head(0):
            dtTest = colName
            if not isinstance(dtTest,
                              datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, axis=1)

        df = df.assign(surveyDate=surveyDateVal)

        # Move surveyDate to the front, then unpivot.
        newOrder = [surveyDateColName]
        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)
        df = df[newOrder]

        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])
        return df

    def LoadAllFromS3(self):
        '''
        Process a single category configured in the categories dictionary in
        the jobConfig.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName +
                             " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            # Close the connection on both the success and failure paths
            # (was previously leaked on error).
            if rsConnect is not None:
                rsConnect.close()

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the data frame or survey date.

        mode == "getSurveyDate" reads the sheet without skipping rows and
        returns the survey-date cell; otherwise the configured data rows are
        returned as a DataFrame.
        '''
        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]

        # NOTE(review): "sheetname" was renamed "sheet_name" in newer pandas
        # releases; kept as-is for the pinned pandas version.
        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])

        if mode == "getSurveyDate":
            result = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            result = df
        return result

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Returns the date based on the file's name.

        E.g. "CFMar2017.xlsx" -> "2017-3-01" (prefix and "CF" stripped, month
        abbreviation converted to its number).
        '''
        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:len(surveyDateColValue)]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(
                surveyDateColValue[:3])) + "-01"
        return surveyDateColValue

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the
        input files.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)
        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " +
                             emFile)
            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.csvFolder + "/" + os.path.splitext(
                emFile)[0] + ".csv"
            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")
                # A float means the cell did not hold a usable date; fall
                # back to deriving it from the file name.
                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)
                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)
                self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
                self.fileUtilities.DeleteFile(csvFilename)
            except XLRDError:
                # The workbook simply lacks the configured tab; skip it.
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " +
                                 emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " +
                                  emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def CheckWorkingFolders(self):
        '''
        Check if the working folders are out there to re-create them
        '''
        self.logger.info(self.moduleName + "Checking on working folders...")
        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be uploaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.csvFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        '''
        Entry point: registers the ETL run, executes the
        download -> process -> upload -> load pipeline, and marks the run
        failed on any error.
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            self.rawFolder = self.localTempDirectory + "/" + "Raw"
            self.csvFolder = self.localTempDirectory + "/" + "CSV"
            self.CheckWorkingFolders()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)
            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            # str(err) rather than err.message (removed in Python 3).
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],\
                                             currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            # Re-raise the original exception: keeps type and traceback.
            raise
class AutoInsight(ApplicationBase):
    '''
    This class is used to control the data load process for Auto Insight.

    Flow: copy one Excel file from a shared folder -> unpivot to CSV ->
    gzip -> upload to S3 -> COPY into Redshift.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(AutoInsight, self).__init__()
        self.awsParams = ""
        self.processingFile = None  # name of the file picked up by BulkDownload
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.

        Copies the single configured file (job["fileName"]) from the shared
        folder into the local temp directory.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(
            self.job["srcSharedFolder"])
        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")
        for fileName in sharedFiles:
            if fileName == self.job["fileName"]:
                self.processingFile = fileName
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.localTempDirectory + "/" + self.processingFile)

    def LoadAllFromS3(self):
        '''
        Process a single category configured in the categories dictionary in
        the jobConfig.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["ddl"]["table"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName +
                             " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            # Close the connection on both the success and failure paths
            # (was previously never closed).
            if rsConnect is not None:
                rsConnect.close()

    @staticmethod
    def FormatColNameDate(dtText):
        '''
        Converts the abbreviated date to YYYY-MM-01 format.

        E.g. "Mar2017" -> "2017-3-01". If the text does not parse as an
        abbreviated month + year it is returned unchanged.
        '''
        # The original version built an exception *instance* and used it in
        # the except clause (invalid in Python 3, and StandardError no
        # longer exists); a plain Exception catch preserves the original
        # "fall back to the raw text" intent.
        try:
            textFixed = str(dtText[4:]) + "-" + str(
                list(calendar.month_abbr).index(dtText[:3])) + "-01"
        except Exception:
            textFixed = dtText
        return textFixed

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the
        input files.
        '''
        self.logger.info(self.moduleName + " - Processing file: " +
                         self.processingFile)
        rawFileName = self.localTempDirectory + "/" + self.processingFile
        csvFilename = self.localTempDirectory + "/" + \
            self.processingFile.split(".")[0] + ".csv"
        try:
            columnNames = []
            # NOTE(review): "sheetname" was renamed "sheet_name" in newer
            # pandas releases; kept as-is for the pinned pandas version.
            df = pandas.read_excel(rawFileName,
                                   sheetname=self.job["worksheetName"],
                                   index_col=None,
                                   na_values=None,
                                   skiprows=self.job["skipRows"],
                                   skip_footer=self.job["skipFooter"])

            # Normalize the date-like column headers, keep the id columns
            # untouched, then unpivot on the id columns.
            for colName in df.head(0):
                if colName not in self.job["columns_no_melt"]:
                    columnNames.append(self.FormatColNameDate(colName))
                else:
                    columnNames.append(colName)
            df.columns = columnNames
            df = df.melt(id_vars=self.job["columns_no_melt"])

            df.to_csv(csvFilename,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)
            self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
            self.fileUtilities.DeleteFile(csvFilename)
        except XLRDError:
            # The workbook simply lacks the configured tab; skip it.
            self.logger.info(self.moduleName + " - No tab named '" +
                             self.job["worksheetName"] + "' in " +
                             self.processingFile)
        except Exception:
            self.logger.error(self.moduleName +
                              " - Error while trying to process file " +
                              self.processingFile)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(rawFileName)

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be uploaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        fileName = self.processingFile.split(".")[0] + ".csv.gz"
        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileName,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--quiet")

    def ExecuteCreateTable(self):
        '''
        Checks if the tables needs to be created
        '''
        tb = self.job['ddl']
        tb['schemaName'] = self.job['destinationSchema']
        fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
        RedshiftUtilities.PSqlExecute(fname, self.logger)

    def Start(self, logger, moduleName, filelocs):
        '''
        Entry point: registers the ETL run, executes the
        create-table -> download -> process -> upload -> load pipeline, and
        marks the run failed on any error.
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            self.ExecuteCreateTable()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)
            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            # str(err) rather than err.message (removed in Python 3).
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(
                    filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            # Re-raise the original exception instead of an empty
            # Exception(): keeps the type, message and traceback.
            raise
class PGCRAirMarketsAthenaSpark(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data.

    Spark/Athena variant: downloads the source zip from S3, reads the CSV
    with Spark, saves parquet and registers the Athena tables.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()
        self.awsParams = ""
        # Working folders; populated in Start() / ProcessTable().
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []  # cleaned CSVs awaiting compression
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")
        S3Utilities.SyncFolderAWSCli(
            "s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
            self.rawDataFolder, True)

    def CleanUpAndPack(self):
        '''
        Main control to iterate thru the folders cleaning the files and
        packing them to be uploaded to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)
        for rawFolderName in rawFolders:
            # Reset the pack list for every source folder.
            self.toPackFiles = []
            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compress the files for a given folder, right now is only the
        emissions file being packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")
        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(
                csvFile) + ".gz"
            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files replacing bd
        characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        # Keep only the emissions files; everything else is deleted below.
        fileList = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] in fileName
        ]
        fileListToDel = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] not in fileName
        ]

        self.logger.info("Cleaning up files for folder " + rawFolderName +
                         "...")
        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job[
                "srcFileNamePrefix"] + "_" + rawFolderName + ".csv"
            # Strip characters outside \x00-\x76 and replace single quotes
            # with backticks.
            self.fileUtilities.ReplaceIterativelyInFile(
                fullFileName, toPackFileName, [{
                    r"[^\x00-\x76]+": ""
                }, {
                    "'": "`"
                }])
            self.fileUtilities.RemoveLines(toPackFileName,
                                           self.job["removeLines"])
            self.toPackFiles.append(toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" +
                                          airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        # Build the path before the try so it is always bound when the
        # except handler formats the error message.
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job[
            "inputZipFileName"]
        try:
            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            # Was "except StandardError", which is a NameError under
            # Python 3; str(err) replaces the removed err.message.
            self.logger.info("Unable to decompress file: " + filePath +
                             " Error: " + str(err))

    def UploadPackedToS3(self):
        '''
        Uploads all files packed to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into
        the final table in Redshift.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName":
                        self.job["tableName"] + self.job["srcFileNamePrefix"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            # Close the connection on both the success and failure paths
            # (was previously never closed).
            if rsConnect is not None:
                rsConnect.close()

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process
        execution.
        '''
        self.logger.info("Cleaning local working folders...")
        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)
        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def ProcessTable(self, table):
        '''
        Process data for the table: download the source zip from S3, unzip
        it, read the CSV with Spark, save as parquet and register the
        Athena table.
        :param table: table configuration dict from job["tables"]
        :return: None
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = \
            self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)
        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)

        # NOTE(review): the archive member name is hard coded; confirm it
        # is stable across deliveries or move it into the job config.
        fileToBeloaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true',
                       delimiter=self.job["delimiter"],
                       ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeloaded))
        self.logger.info(self.moduleName + " -- " + "Done reading " +
                         str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " +
                         "UploadFilesCreateAthenaTablesAndSqlScripts " +
                         " finished ")

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        # Initialize before the try: the except handler reads currProcId,
        # which would otherwise be unbound if ApplicationBase.Start raised.
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            for table in self.job["tables"]:
                self.ProcessTable(table)

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)
            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            # str(err) rather than err.message (removed in Python 3).
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(
                    filelocs["tblEtl"]["table"], currProcId,
                    'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            # Re-raise the original exception: keeps type and traceback.
            raise
class ConsensusAthenaSpark(ApplicationBase):
    '''
    This class is used to control the data load process from different OPEC
    file sources.

    Spark/Athena variant: Excel files are unpivoted to CSV, read with
    Spark, saved as parquet and registered in Athena (optionally loaded
    into Redshift).
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(ConsensusAthenaSpark, self).__init__()
        self.rawFolder = None  # populated in ProcessTables()
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.

        Copies files whose name starts with the configured prefix and whose
        extension is in job["validExts"] from the shared folder into the
        local Raw folder.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(
            self.job["srcSharedFolder"])
        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")
        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"]
                ) and os.path.splitext(fileName)[1] in self.job["validExts"]:
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Converts the actual excel file into csv for the worksheet configured.

        Drops the configured columns/rows, keeps only the bank-name column
        plus columns whose header parsed as a datetime, prepends a
        surveyDate column and unpivots the frame.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"

        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, axis=1)
        df = df.drop(self.job["dropAfterHeader"], axis=0)

        # Keep the bank-name column and the forecast-period columns (the
        # ones whose header parsed as a datetime); drop everything else.
        for colName in df.head(0):
            dtTest = colName
            if not isinstance(dtTest,
                              datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, axis=1)

        df = df.assign(surveyDate=surveyDateVal)

        # Move surveyDate to the front, then unpivot.
        newOrder = [surveyDateColName]
        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)
        df = df[newOrder]

        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])
        return df

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the data frame or survey date.

        mode == "getSurveyDate" reads the sheet without skipping rows and
        returns the survey-date cell; otherwise the configured data rows are
        returned as a DataFrame.
        '''
        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]

        # NOTE(review): "sheetname" was renamed "sheet_name" in newer pandas
        # releases; kept as-is for the pinned pandas version.
        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])

        if mode == "getSurveyDate":
            result = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            result = df
        return result

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Returns the date based on the file's name.

        E.g. "CFMar2017.xlsx" -> "2017-3-01" (prefix and "CF" stripped,
        month abbreviation converted to its number).
        '''
        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:len(surveyDateColValue)]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(
                surveyDateColValue[:3])) + "-01"
        return surveyDateColValue

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the
        input files.
        '''
        # basestring only exists in Python 2; fall back to str on Python 3
        # so the isinstance() check below works on both.
        try:
            textTypes = basestring
        except NameError:
            textTypes = str

        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)
        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " +
                             emFile)
            rawFileName = self.rawFolder + "/" + emFile
            # NOTE(review): no "/" between csvFolder and the file name —
            # confirm csvFolder already ends with a separator.
            csvFilename = self.fileUtilities.csvFolder + os.path.splitext(
                emFile)[0] + ".csv"
            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")
                # A float means the cell did not hold a usable date; fall
                # back to deriving it from the file name. A text cell like
                # "March 1, 2017" is normalized to YYYY-MM-DD.
                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)
                elif isinstance(surveyDatedt, textTypes):
                    if "," in surveyDatedt:
                        tmpDatedt = datetime.strptime(surveyDatedt,
                                                      '%B %d, %Y')
                        surveyDatedt = datetime.strftime(
                            tmpDatedt, "%Y-%m-%d")

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)
                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)
            except XLRDError:
                # The workbook simply lacks the configured tab; skip it.
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " +
                                 emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " +
                                  emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls file from share and place in raw folder, converts to CSV,
        reads with Spark, saves parquet, registers the Athena table and
        optionally loads Redshift.
        '''
        try:
            self.rawFolder = self.localTempDirectory + "/" + "Raw"
            self.BulkDownload()
            self.ProcessFiles()

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except Exception:
            # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
            # propagate untouched.
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessRequest")
            raise

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        # At some point this will be part of Start
        ApplicationBase.ProcessInput(self, logger, moduleName, filelocs)